###################################
## Refugees & Civil Conflict     ##
## Final Data Generation Code    ##
## Zhou, Shaver                  ##
## Last Updated: 12/04/2020      ##
###################################

# Load required packages:
library(raster)
library(rgeos)
library(data.table)
library(readxl)
library(stringr)
library(plyr)
library(dplyr)
library(lubridate)
library(DataCombine)
library(swfscMisc)
library(sp)

# Import data:
# Yearly country borders:
countries <- shapefile("cshapes_0/cshapes.shp")
# 1998 province boundaries:
shape <- shapefile("world_admin_1998.shp")
# SJM has two observations in ruggedness data because Svalbard and Jan Mayen are listed under separate
# contry names but not separate country codes.
# So drop Jan Mayen, which has no population:
shape <- shape[shape@data$CNTRY_NAME!= "Jan Mayen", ]
# Global UTM grid .shp file:
utm <- shapefile("World_UTM_Grid/World_UTM_Grid.shp")
# UCDP GED:
ged <- read.csv("ged191.csv")
# UCDP GED separate Syria data:
ged_syria <- read.csv("ged_syria.csv")
# PRIO Conflict Site (1989–2008) data:
conflict_sites <- read.csv("ConflictSite 4-2010_v3 Dataset.csv")
conflict_sites <- conflict_sites[, 1:9]
# 2020 UNHCR Camps data:
locations <- read_excel("wrl_ppl_poc_07_01_2020.xls")
# PRIO-GRID:
prio_grid <- read.csv("prio_grid_1989_2014.csv")
# PRIO-GRID centroids:
prio_grid_centroids <- shapefile("priogrid_shapefiles/priogrid_centroid.shp")
# PRIO-GRID cells:
prio_grid_cells <- shapefile("priogrid_shapefiles/priogrid_cell.shp")
# Ruggedness data:
ruggedness <- read.csv("Ruggedness_Index_world_admin_1998.csv", header = TRUE)
# Adjacency matrix:
load("regionAdjMat.RData")

# Now, add Syria data to larger GED dataset:
# First, drop column 10, which does not appear in the Syria data:
which(colnames(ged) %in% colnames(ged_syria))
ged <- ged[, -10]
# Next, note that some of the "points" in the Syria data are not specific to any
# part of the country but instead identify Syria as the location and give coordinates
# that appear to correspond to the centroid of the country; thus, remove these
# obervations:
ged_syria <- ged_syria[ged_syria$latitude != 35 & ged_syria$longitude != 38, ]
# Finally, drop Syrian events for the year 2019:
ged_syria <- ged_syria[ged_syria$year < 2019, ]
# Then, combine the datasets:
ged <- rbind(ged, ged_syria)

# Now, generate country-year dataset of territorial conflict extent -- 

# First, subset both country boundary shapefile and GED to the post 1989 period:
ged_90 <- ged[ged$year > 1989, ]
countries_90 <- countries[countries@data$COWEYEAR > 1989,  ]

# For Germany in 1990, simply use current German boundaries:
countries_90 <- countries_90[!(countries_90@data$CNTRY_NAME %in% c("Germany Democratic Republic", "Germany Federal Republic")),  ]

# Next, modify country-boundary shapefile so that no two overlapping boundaries share a year
# of coverage. (This occurs when a territorial change occurs mid-year.)
# Case by case:
# Ethiopia -- 
# Eritrea becomes independent nation in May of 1993; therefore, set Ethopia's 
# previous border to ending in 1992.
countries_90@data[countries_90@data$FEATUREID == 195, ]$COWEYEAR <- 1992
# Indonesia -- 
# "In 2002, Indonesia lost two islands to Malaysia - after the International 
# Court of Justice ruled against it in a territorial dispute - and two to East Timor 
# when it became independent." (https://www.bbc.com/news/world-asia-40168981)
countries_90@data[countries_90@data$FEATUREID == 212, ]$COWEYEAR <- 2001
# Saudi Arabia -- 
# resolves border disputes with Yemen in 2000. Territory appears to change somewhat.
countries_90@data[countries_90@data$FEATUREID == 239, ]$COWEYEAR <- 1999
# Serbia; Serbia and Montenegro -- 
# In 2008, Serbia's territory decrease when Kosovo declares independence:
countries_90@data[countries_90@data$FEATUREID == 234, ]$COWEYEAR <- 2007
# In 2006, Montenegro votes on independence:
countries_90@data[countries_90@data$FEATUREID == 231, ]$COWEYEAR <- 2005
# South Africa -- 
# South West Africa becomes independent of South Africa in 1990; therefore, 
# drop pre-1990 map from dataset and simply use current bounaries:
countries_90 <- countries_90[!(countries_90@data$FEATUREID %in% 196),  ]
# Sudan -- 
# South Sudan gain independence from Sudan:
countries_90@data[countries_90@data$FEATUREID == 58, ]$COWEYEAR <- 2010
# Tanzania -- 
# capital city is changed from Dar es Salaam to Dodoman (no apparent territorial change)
# For now, drop one (so that events are not attached to multiple boundaries) but adjust
# start year:
countries_90 <- countries_90[countries_90@data$FEATUREID != 159,  ]
countries_90@data[countries_90@data$FEATUREID == 242, ]$COWSYEAR <- 1990
# USSR, Russia -- 
# In 1991, USSR undergoes territorial changes:
countries_90@data[countries_90@data$FEATUREID == 192, ]$COWEYEAR <- 1990
# On Dec 1991, Russia becomes independent country again; set to 1992:
countries_90@data[countries_90@data$FEATUREID == 65, ]$COWSYEAR <- 1992
# Yemen -- 
# Yemen unification occurs in 1990; so, simply drop Yemen Arab Republic and Yemen People's 
# Republic:
countries_90 <- countries_90[!(countries_90@data$FEATUREID %in% c(200, 201)),  ]
# Then, in 2000, Yemen's territory changes slightly with border agreement with Saudi Arabia:
countries_90@data[countries_90@data$FEATUREID == 240, ]$COWEYEAR <- 1999
# Yugoslavia -- 
countries_90@data[countries_90@data$FEATUREID == 191, ]$COWEYEAR <- 1991
# Don't fully understand the Yugoslavia shapefile running from 1992 though 1993 given
# that a separate Serbia and Montenegro shapefile begins in 1992. So, that points aren't 
# captured more than once, dropping that shapefile:
countries_90@data[countries_90@data$FEATUREID != 233, ]
# Finally, to match COW naming conventions, change "Serbia and Montenegro"
# and "Serbia" to "Yugoslavia":
countries_90@data[countries_90@data$CNTRY_NAME == "Serbia", ]$CNTRY_NAME <- "Yugoslavia"
countries_90@data[countries_90@data$CNTRY_NAME == "Serbia and Montenegro", ]$CNTRY_NAME <- "Yugoslavia"

# If COWEYEAR in boundaries dataset is equal to 2016 (the last year in cShapes), adjust to
# 2018 (the last year of our study period):
countries_90@data[countries_90@data$COWEYEAR == 2016, ]$COWEYEAR <- 2018

# Now, match province boundaries to each yearly country map from 1990 through
# 2018:

# First, extract the centroids of each province:
shape_centroids <- gCentroid(shape, byid=T)

plot(shape_centroids)
plot(countries_90[countries_90@data$COWSYEAR <= 1989 + 1 & 
                    countries_90@data$COWEYEAR >= 1989 + 1, ], add=T)

rownames(countries_90@data) <- 1:nrow(countries_90@data)

# Now, match each province to its relevant country boundary:
list_year <- list()
for(i in 1:length(1990:2018)){
  list_year[[i]] <- over(shape_centroids, countries_90[countries_90@data$COWSYEAR <= 1989 + i & 
                                                       countries_90@data$COWEYEAR >= 1989 + i, ])
  list_year[[i]] <- cbind(shape@data[, c("GMI_ADMIN", "GMI_CNTRY", "CNTRY_NAME", "ADMIN_NAME")], list_year[[i]])
  # Drop Antarctica, Greenland, and United States Minor Outlying Islands:
  list_year[[i]] <- list_year[[i]][!(list_year[[i]]$GMI_CNTRY %in% c("ATA", "GRL", "UMI")), ]
  row.names(list_year[[i]]) <- 1:nrow(list_year[[i]])
  # Add year indicator:
  list_year[[i]]$Year <- 1989 + i
}

# Now, explore matches for the first year (1990):
# Which cases are unmatched?
unmatched <- list_year[[1]][is.na(list_year[[1]][, 5]), ]
# Observations: 
# 1) Almost all unmatched provinces are island states. 
# 2) The province .shp file contains many more countries than the cShapes .shp file. 
length(unique(shape@data$CNTRY_NAME))
length(unique(countries_90@data$CNTRY_NAME))
# Therefore:
# 1) Match the non-island provinces that weren't previously matched, and 
# 2) Match islands belonging to other states to those states (e.g. Hawaii to United States), and
# 3) Drop those island states that aren't included in cShapes. 

# To do this, run loop, matching each unmatched province for each year to 
# its nearest country:
list_year_unmatched <- list()
for(i in 1:length(1990:2018)){
  # Extract centroids of unmatched provinces:
  unmatched <- gCentroid(shape[shape@data$GMI_ADMIN %in% 
                                 list_year[[i]][is.na(list_year[[i]][, 5]), ]$GMI_ADMIN, ], byid = T)
  # Create data frame of unmatched provinces:
  list_year_unmatched[[i]] <- list_year[[i]][is.na(list_year[[i]][, 5]), ]
  row.names(list_year_unmatched[[i]]) <- 1:nrow(list_year_unmatched[[i]])

    # Then, for each unmatched province for a given year --
    for (j in 1:length(unmatched)){
    # Find the country to which it is closest:
    
    # Project the spatial data so that distances can be measured:
    # To do this, we must first match the spatial data to the relevant UTM zones so that
    # distances can be accurately measured:
    
    crs(unmatched) <- crs(utm)
    crs(unmatched) <- crs(countries_90) <- paste("+proj=utm +zone=", 
                                                 over(unmatched[j,], utm)[2], 
                                                 sep = "")
    temp <- which.min(gDistance(unmatched[j,], countries_90, byid=TRUE))
    # Then, add that country name to the data frame:
    list_year_unmatched[[i]][j, 5] <- countries_90@data[temp, ]$CNTRY_NAME
    # Now, if the country names do not match, then the island state either is not
    # included in cShapes or it is located closer to another country than it's actual
    # country. To correct this, for islands that are part of other countries, simply
    # use the already available country information associated with the province:
    if(grepl("-", list_year_unmatched[[i]][j, 1], fixed=TRUE)){
      list_year_unmatched[[i]][j, 5] <- list_year_unmatched[[i]][j, 3]
    }
  }
  # Then, drop those islands not belonging to other countries (except Comoros):
  list_year_unmatched[[i]]$independent <- NA
  list_year_unmatched[[i]]$independent <- ifelse(nchar(list_year_unmatched[[i]]$GMI_ADMIN) < 4 & 
                                                   list_year_unmatched[[i]]$GMI_ADMIN != "COM", 1, 0)
  list_year_unmatched[[i]] <- list_year_unmatched[[i]][list_year_unmatched[[i]]$independent %in% 0, ]
  list_year_unmatched[[i]]$independent <- NULL
}

# Now, create subset of the previously matched provinces only:
for(i in 1:length(1990:2018)){
  list_year[[i]] <- list_year[[i]][!(is.na(list_year[[i]][, 5])), ]
}

dataset_matched <- rbindlist(list_year)
dataset_unmatched <- rbindlist(list_year_unmatched)

# Then, construct full dataset:
dataset <- rbind(dataset_matched, dataset_unmatched)

# Then, correct provinces where country name is incorrectly assigned as a function
# of the matching process:
colnames(dataset)[5] <- "Country"
# Afghanistan's Badakhshan province:
dataset$Country <- ifelse(dataset$ADMIN_NAME == "Badakhshan", "Afghanistan", as.character(dataset$Country))
# Bangladesh's Chittagong province:
dataset$Country <- ifelse(dataset$ADMIN_NAME == "Chittagong", "Bangladesh", as.character(dataset$Country))
# Kyrgyzstan's Osh province:
dataset$Country <- ifelse(dataset$ADMIN_NAME == "Osh", "Kyrgyzstan", as.character(dataset$Country))
# Croatia:
dataset$Country <- ifelse(dataset$ADMIN_NAME == "Croatia" & dataset$Year > 1990, "Croatia", as.character(dataset$Country))

# Next, add civil war data to panel:

# Match the GED long-lat coordinates to the provinces:
points <- ged_90[, c("longitude", "latitude")]
coordinates(points) <- ~longitude + latitude
# Put coordinates in same projection as the provinces:
projection(points) <- crs(shape)
# Merge the datasets:
ged_90 <- cbind(ged_90, over(points, shape))

# # # 
# Are all points matched to a province? 
nrow(ged_90[is.na(ged_90$GMI_ADMIN), ]) / nrow(ged_90)
# # No; 1.1% not.
# Plot the unmatched points to see where they are:
unmatched_points <- ged_90[is.na(ged_90$GMI_ADMIN), c("longitude", "latitude")]
coordinates(unmatched_points) <- ~longitude + latitude
projection(unmatched_points) <- crs(shape)
plot(shape)
plot(unmatched_points, col=rgb(red=0.1, green=0.1, blue=1.0, alpha=0.1), add=T)
# Virtually all appaer to fall just off coastlines. To remedy, use nearst distance approach 
# to match violence to match the remaining unmatched points to the nearest province:
violence_unmatched <- ged_90[is.na(ged_90$GMI_ADMIN), ]

for(i in 1:length(unmatched_points)){
  crs(unmatched_points) <- crs(utm)
  crs(unmatched_points) <- crs(shape) <- paste("+proj=utm +zone=", 
                                               over(unmatched_points[i,], utm)[2], 
                                               sep = "")
  temp <- which.min(gDistance(unmatched_points[i,], shape, byid=TRUE))
  violence_unmatched[i, colnames(shape@data)] <- 
    shape@data[temp, colnames(shape@data)]
}

# Next, combine originally matched points with the points later matched:
ged_90 <- rbind(ged_90[!(is.na(ged_90$GMI_ADMIN)), ], violence_unmatched) 
# # # NOTE: Describe this process in footnote.

# Aggregate attacks:
ged_90$incident <- 1
# All
ged_py_attacks <- ged_90 %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(attack = sum(incident, na.rm = T))
# State-Based:
ged_py_attacks_state_based <- ged_90[ged_90$type_of_violence %in% 1, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(attack_state_based = sum(incident, na.rm = T))
# Non-State:
ged_py_attacks_non_state <- ged_90[ged_90$type_of_violence %in% 2, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(attack_non_state = sum(incident, na.rm = T))
# One-Sided:
ged_py_attacks_one_sided <- ged_90[ged_90$type_of_violence %in% 3, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(attack_one_sided = sum(incident, na.rm = T))

# Battledeaths:
# All:
ged_py <- ged_90 %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(best = sum(best, na.rm = T))
# State-Based:
ged_py_state_based <- ged_90[ged_90$type_of_violence %in% 1, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(best_state_based = sum(best, na.rm = T))
# Non-State:
ged_py_non_state <- ged_90[ged_90$type_of_violence %in% 2, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(best_non_state = sum(best, na.rm = T)) 
# One-Sided:
ged_py_one_sided <- ged_90[ged_90$type_of_violence %in% 3, ] %>%
  group_by(GMI_ADMIN, year) %>%
  summarise(best_one_sided = sum(best, na.rm = T))

# Then, match this data to the full panel:
colnames(ged_py_attacks)[2] <- colnames(ged_py_attacks_state_based)[2] <- 
  colnames(ged_py_attacks_non_state)[2] <- colnames(ged_py_attacks_one_sided)[2] <- 
  colnames(ged_py)[2] <- colnames(ged_py_state_based)[2] <- 
  colnames(ged_py_non_state)[2] <- colnames(ged_py_one_sided)[2] <- "Year"

ged_py_attacks$Year <- ymd(paste(ged_py_attacks$Year, "01", "01", sep = "-"))
ged_py_attacks_state_based$Year <- ymd(paste(ged_py_attacks_state_based$Year, "01", "01", sep = "-"))
ged_py_attacks_non_state$Year <- ymd(paste(ged_py_attacks_non_state$Year, "01", "01", sep = "-"))
ged_py_attacks_one_sided$Year <- ymd(paste(ged_py_attacks_one_sided$Year, "01", "01", sep = "-"))
ged_py$Year <- ymd(paste(ged_py$Year, "01", "01", sep = "-"))
ged_py_state_based$Year <- ymd(paste(ged_py_state_based$Year, "01", "01", sep = "-"))
ged_py_non_state$Year <- ymd(paste(ged_py_non_state$Year, "01", "01", sep = "-"))
ged_py_one_sided$Year <- ymd(paste(ged_py_one_sided$Year, "01", "01", sep = "-"))
dataset$Year <- ymd(paste(dataset$Year, "01", "01", sep = "-"))
dataset <- join(dataset, ged_py_attacks, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_attacks_state_based, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_attacks_non_state, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_attacks_one_sided, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_state_based, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_non_state, by = c("GMI_ADMIN", "Year"))
dataset <- join(dataset, ged_py_one_sided, by = c("GMI_ADMIN", "Year"))

# Then, convert values of NA to 0s (except for Syria, for which data does not begin
# until 2016):
# Attacks:
dataset$attack[is.na(dataset$attack)] <- 0
dataset$attack_state_based[is.na(dataset$attack_state_based)] <- 0
dataset$attack_non_state[is.na(dataset$attack_non_state)] <- 0
dataset$attack_one_sided[is.na(dataset$attack_one_sided)] <- 0
# Battledeaths:
dataset$best[is.na(dataset$best)] <- 0
dataset$best_state_based[is.na(dataset$best_state_based)] <- 0
dataset$best_non_state[is.na(dataset$best_non_state)] <- 0
dataset$best_one_sided[is.na(dataset$best_one_sided)] <- 0
# Apply Syria correction:
dataset[dataset$Country == "Syria" & 
          dataset$Year < ymd("2016-01-01"), c("attack", "attack_state_based", 
                                              "attack_non_state", "attack_one_sided", 
                                              "best", "best_state_based", 
                                              "best_non_state", "best_one_sided")] <- NA

# Now, restrict dataset to required columns:
dataset <- setDF(dataset)
dataset <- dataset[ , which(!(names(dataset) %in% colnames(countries_90@data)[2:24]))]

# Next, generate binary battledeath threshold -- 25 or more:
# All violence: 
dataset$incidence_2_best <- ifelse(dataset$best >= 25, 1, 0)

# Next, add conflict sites data:

# First, drop 1989 observations from dataset:
conflict_sites <- conflict_sites[conflict_sites$Year > 1989, ]
row.names(conflict_sites) <- 1:nrow(conflict_sites)
# Then, drop cases where no conflict center is specified:
table(conflict_sites$Latitude)
conflict_sites <- conflict_sites[!(conflict_sites$Latitude %in% c(-888, -999)), ]
row.names(conflict_sites) <- 1:nrow(conflict_sites)
# Next, note that there are cases that affect multiple countries:
conflict_sites[which(grepl(",", conflict_sites$Conflict.territory)), 
               c("Year", "Conflict.territory")]
# Year Conflict.territory
# 21  1990    India, Pakistan
# 22  1991    India, Pakistan
# 23  1992    India, Pakistan
# 24  1996    India, Pakistan
# 25  1997    India, Pakistan
# 26  1998    India, Pakistan
# 27  1999    India, Pakistan
# 28  2000    India, Pakistan
# 29  2001    India, Pakistan
# 30  2002    India, Pakistan
# 31  2003    India, Pakistan
# 508 1991       Iraq, Kuwait
# 641 1995      Ecuador, Peru
# 657 1998  Eritrea, Ethiopia
# 658 1999  Eritrea, Ethiopia
# 659 2000  Eritrea, Ethiopia

# For these, create additional entries in the conflict sites dataset so that 
# each observation is associated with a unique country-year:
# First, duplicate the cases with two affected countries:
temp <- conflict_sites[which(grepl(",", conflict_sites$Conflict.territory)),]
# Then, keep only the first country name for the original cases:
conflict_sites$Conflict.territory <- gsub(",.*", "", conflict_sites$Conflict.territory)
# Next, keep only the second country name in the duplicated cases:
temp$Conflict.territory <- gsub(".*, ", "", temp$Conflict.territory)
# Then, combine the two datasets:
conflict_sites <- rbind(conflict_sites, temp)

# Now, are all country names in the conflict sites dataset included in the dataset
# country names?
which(!(unique(conflict_sites$Conflict.territory) %in% unique(dataset$Country)))
# They are not. Which are not?
unique(conflict_sites$Conflict.territory)[which(!(unique(conflict_sites$Conflict.territory) %in% unique(dataset$Country)))]
# [1] "DRC"               "Soviet Union"      "Croatia"           "Comoros"          
# [5] "Republic of Congo" "USA"               "Ivory Coast"  
# Adjust the names to match between datasets:
conflict_sites$Conflict.territory <- ifelse(conflict_sites$Conflict.territory == "DRC", 
                                            "Congo, DRC", conflict_sites$Conflict.territory)
conflict_sites$Conflict.territory <- ifelse(conflict_sites$Conflict.territory == "Soviet Union", 
                                            "USSR", conflict_sites$Conflict.territory)
conflict_sites$Conflict.territory <- ifelse(conflict_sites$Conflict.territory == "Republic of Congo", 
                                            "Congo", conflict_sites$Conflict.territory)
conflict_sites$Conflict.territory <- ifelse(conflict_sites$Conflict.territory == "USA", 
                                            "United States", conflict_sites$Conflict.territory)
conflict_sites$Conflict.territory <- ifelse(conflict_sites$Conflict.territory == "Ivory Coast", 
                                            "Cote d'Ivoire", conflict_sites$Conflict.territory)
unique(conflict_sites$Conflict.territory)[which(!(unique(conflict_sites$Conflict.territory) %in% unique(dataset$Country)))]

# Convert conflict sites year variable:
conflict_sites$Year <- ymd(paste(conflict_sites$Year, "01", "01", sep = "-"))

# Next, construct conflict circles from centroids, noting that given radii are 
# specified in kilometers:
conflict_sites_list <- list()
for(i in 1:nrow(conflict_sites)){
  # First, construct a conflict circle from each centroid:
  temp <- circle.polygon(conflict_sites[i, "Longitude"], 
                         conflict_sites[i, "Latitude"], 
                         conflict_sites[i, "Radius"], brng.limits = 0, 
                         sides = 1, by.length = TRUE,
                         units = "km", ellipsoid = datum(), dist.method = "lawofcosines",
                         destination.type = "ellipsoid", poly.type = "cart.earth")
  # Convert coordinate points to polygon:
  temp <- Polygon(temp)
  temp = Polygons(list(temp),1)
  temp = SpatialPolygons(list(temp))
  # Then, store each conflict circle:
  conflict_sites_list[[i]] <- temp
}

# Then, match each conflict circle to provinces within the relevant country-year:
# First, match each conflict circle to provinces:
intersection_list <- list()
for(i in 1: nrow(conflict_sites)){
  projection(conflict_sites_list[[i]]) <- crs(shape)
  intersection_list[[i]] <- raster::intersect(shape, conflict_sites_list[[i]])
}

# Then, subset the list of matched provinces to only those in the relevant 
# country and output with relevant years and civil war indicator to match to 
# dataset:
affected_provinces <- list()
gmi_admin_country_year <- dataset[, c("GMI_ADMIN", "Country", "Year")]
for(i in 1: nrow(conflict_sites)){
  # For each conflict -- 
  # Identify the year:
  temp_year <- conflict_sites[i, "Year"]
  # Identify the country:
  temp_country <- conflict_sites[i, "Conflict.territory"]
  # Use this information to extract all country-year provinces:
  temp_provinces <- gmi_admin_country_year[gmi_admin_country_year$Year == temp_year &
                                   gmi_admin_country_year$Country == temp_country, ]$GMI_ADMIN
  # Then, take the intersection of country-year provinces and those covered
  # (at least in part) by a conflict circle:
  affected_provinces_temp  <- intersection_list[[i]]@data[which(intersection_list[[i]]@data$GMI_ADMIN %in% temp_provinces), ]$GMI_ADMIN
  # Now, attach the conflict year to these provinces:
  affected_provinces[[i]] <- cbind(as.data.frame(affected_provinces_temp), 
                              as.data.frame(temp_year))
  # Add a civil war indicator:
  affected_provinces[[i]]$civil_war <- 1
  # Finally, adjust the names:
  colnames(affected_provinces[[i]])[1:2] <- c("GMI_ADMIN", "Year")
}

# Now, match this data back to the complete dataset:
# First, combine the list items:
temp <- rbindlist(affected_provinces)
# Next, note that there are sometimes multiple conflicts occurring in a 
# single country during the same year (see, perhaps most notably, India). 
# Thus, first remove the "duplicates":
temp <- temp %>% distinct(GMI_ADMIN, Year, .keep_all = T)
# Finally, join the datasets:
dataset <- join(dataset, temp, type = "left")
# Then, for all years between 1990 and 2008 (the years for which we have conflict
# site data), assign values of NA 0s. Otherwise, leave as NA:
dataset$civil_war <- ifelse(is.na(dataset$civil_war) & 
                              dataset$Year %in% seq(min(conflict_sites$Year), 
                                                    max(conflict_sites$Year), 
                                                    by = "years"), 
                            0, dataset$civil_war)

# Rename civil war variable "incidence":
names(dataset)[names(dataset) == "civil_war"] <- "incidence"

# Then, generate onset and peace years variables (treating conflict as new if 
# the previous year experienced no conflict):
onset_function <- function(x, y){ 
  # x == dataset, y == incidence variable, 
  # First, generate civil war duration variable:
  x$cwdur <- x[, y]
  for(i in 1:(length(x$GMI_ADMIN) - 1)) {
    if(x$cwdur[i] != 0 & x$cwdur[i+1] == 1 & 
       x$GMI_ADMIN[i] == x$GMI_ADMIN[i+1]) {
      x$cwdur[i+1] <- x$cwdur[i] + 1
    } else{
      x$cwdur[i+1] <- x$cwdur[i+1]
    }
  }
  # Then, generate onset variable:
  x$onset <- ifelse(x$cwdur == 0 | x$cwdur > 1, 0, 1)
  # Then, correct civil war onset variable by replacing 'incorrect 0's' with NAs:
  x$onset.n <- ifelse(x[, y] == 1 & x$onset == 0, NA, x$onset)
  # Create peace years:
  x$peaceyrs <- ifelse(x[, y] == 1, 0, 1)
  for(i in 1:(nrow(x) - 1)) {
    if(x$peaceyrs[i] != 0 & x$peaceyrs[i+1] == 1 & 
       x$GMI_ADMIN[i] == x$GMI_ADMIN[i+1]) {
      x$peaceyrs[i+1] <- x$peaceyrs[i] + 1
    } else{
      x$peaceyrs[i+1] <- x$peaceyrs[i+1]
    }
  }
  return(x)
}

# To apply this code, need to subset the dataset to only those years for which we
# have civil war data (1990-2008):
# First, reorder the dataset:
dataset <- dataset[order(dataset$GMI_ADMIN, dataset$Year),]
row.names(dataset) <- 1:nrow(dataset)
# Then, create subset:
temp_subset <- dataset[dataset$Year < ymd("2009-01-01"), ]
# Then, run function:
temp_subset <- onset_function(temp_subset, "incidence")
# Finally, readd the new variables to the full dataset:
dataset <- join(dataset, temp_subset[, c("GMI_ADMIN", "Year", "cwdur", "onset", 
                                      "onset.n", "peaceyrs")])

# Next, construct conflict measures as well using the wzoneData: Zones of Armed Conflicts dataset:
# Load wzoneData .shp files:

# Load each year's confict .shp file:
wzone_list <- list()
for(i in 1:length(1989:2018)){
  wzone_list[[i]] <- shapefile(paste("yearly_shapefile/wzone_", 1988+i, ".shp", sep = ""))
}

# Then, remove the following conflicts, which are not spatially contained, and which Kyosuke 
# Kikuta advises in his codebook:
# conf_id = 408 (US-led war in Afghanistan)
# conf_id = 506 (IS actions against civilians)
# conf_id = 608 (al-Qaida actions against civilians)

for(i in 1:length(1989:2018)){
  wzone_list[[i]] <- wzone_list[[i]][!(wzone_list[[i]]@data$conf_id %in% c(408, 506, 608)), ]
}

# Next, match each conflict polygon to provinces within the relevant country-year:
# First, match each conflict polygon to provinces:
intersection_list <- list()
for(i in 1:length(1989:2018)){
  # projection(wzone_list[[i]]) <- crs(shape)
  proj4string(wzone_list[[i]]) <- proj4string(shape)
  intersection_list[[i]] <- raster::intersect(shape, wzone_list[[i]])
}

# Then, generate a dataset of provinces affeceted by conflict each year with a
# civil war indicator:
affected_provinces <- list()
for(i in 1:length(1989:2018)){
  # For each year, extract the unique set of affected provinces:
  affected_provinces_temp <- unique(intersection_list[[i]]@data$GMI_ADMIN)
  affected_provinces[[i]] <- as.data.frame(affected_provinces_temp)
  affected_provinces[[i]]$Year <- ymd(paste(1988+i, "-01-01", sep = ""))
  # Add a civil war indicator:
  affected_provinces[[i]]$civil_war_wzone <- 1
  # Finally, adjust the names:
  colnames(affected_provinces[[i]])[1] <- c("GMI_ADMIN")
}

# Now, match this data back to the complete dataset:
# First, combine the list items:
temp <- rbindlist(affected_provinces)
# Then, join the datasets:
dataset <- join(dataset, temp, type = "left")

# Rename civil war variable "incidence":
names(dataset)[names(dataset) == "civil_war_wzone"] <- "incidence_wzone"

# Set NAs to 0s:
dataset$incidence_wzone[is.na(dataset$incidence_wzone)] <- 0

# summary(lm(incidence_wzone ~ incidence, data = dataset))
# summary(lm(incidence_wzone ~ attack, data = dataset))

# Then, generate onset and peace years variables (treating conflict as new if 
# the previous year experienced no conflict):
onset_function_wzone <- function(x, y){ 
  # x == dataset, y == incidence variable, 
  # First, generate civil war duration variable:
  x$cwdur_wzone <- x[, y]
  for(i in 1:(length(x$GMI_ADMIN) - 1)) {
    if(x$cwdur_wzone[i] != 0 & x$cwdur_wzone[i+1] == 1 & x$GMI_ADMIN[i] == x$GMI_ADMIN[i+1]){
      x$cwdur_wzone[i+1] <- x$cwdur_wzone[i] + 1
    } else{
      x$cwdur_wzone[i+1] <- x$cwdur_wzone[i+1]
    }
  }
  # Then, generate onset variable:
  x$onset_wzone <- ifelse(x$cwdur_wzone == 0 | x$cwdur_wzone > 1, 0, 1)
  # Then, correct civil war onset variable by replacing 'incorrect 0's' with NAs:
  x$onset.n_wzone <- ifelse(x[, y] == 1 & x$onset_wzone == 0, NA, x$onset_wzone)
  # Create peace years:
  x$peaceyrs_wzone <- ifelse(x[, y] == 1, 0, 1)
  for(i in 1:(nrow(x) - 1)) {
    if(x$peaceyrs_wzone[i] != 0 & x$peaceyrs_wzone[i+1] == 1 & 
       x$GMI_ADMIN[i] == x$GMI_ADMIN[i+1]) {
      x$peaceyrs_wzone[i+1] <- x$peaceyrs_wzone[i] + 1
    } else{
      x$peaceyrs_wzone[i+1] <- x$peaceyrs_wzone[i+1]
    }
  }
  return(x)
}

# Then, run function:
# dataset <- dataset[order(dataset$GMI_ADMIN, dataset$Year),]
# row.names(dataset) <- 1:nrow(dataset)
dataset <- onset_function_wzone(dataset, "incidence_wzone")

# Values for Syria should be set NAs given that wzone does not cover Syria (except where conflict
# potentially spills over):
dataset[dataset$Country %in% "Syria",
        c("incidence_wzone", "cwdur_wzone", "onset_wzone",
          "onset.n_wzone", "peaceyrs_wzone")] <- NA

# Next, process the camps data:
# First, subset the data to refugee and IDP camps and settlements:
camps_settlements <- locations[locations$loc_type %in% c("IDP Camp", "IDP Settlement",
                                                         "Refugee Camp", "Refugee Settlement"), ]

# Determine whether all camps and settlements are georeferenced:
length(which(is.na(camps_settlements$POINT_X)))
# No. 104 are not
length(which(is.na(camps_settlements$POINT_X))) / nrow(camps_settlements)
# That is 4% of cases. 
# Which are these?
no_long_lat <- camps_settlements[is.na(camps_settlements$POINT_X), ]
sort(table(no_long_lat$Country))
# Burkina Faso                    Niger                    Sudan 
# 1                        2                        8 
# Nigeria Central African Republic 
# 30                       63
sort(table(no_long_lat$loc_type))
# 
# Refugee Camp       IDP Camp IDP Settlement 
# 11             12             81 
# Most are IDP settlements in Central African Republic and Nigeria.

# # Write this dataset out for PVL research assistants to help with:
# write.csv(no_long_lat, "~/Dropbox/Refugees_Conflict_Article/GitHub/refugeeconflict/ResearchAssistance/no_long_lat.csv")

# In the following section, we look closely at the eleven cases of refugee camps 
# without lat-longs to match them with lat-long coordinates:

# First, looking at locations in Sudan:
# Al Redis1 (Albahar)
# Open as of 2019: https://data2.unhcr.org/en/documents/download/71746
# In the "Al Wusta" province of Sudan.
gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Al Wusta", ])
temp_coord_1 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Al Wusta", ]))$coords[1]
temp_coord_2 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Al Wusta", ]))$coords[2]
camps_settlements[camps_settlements$name == "Al Redis1 (Albahar)", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Al Redis1 (Albahar)", "POINT_Y"] <- temp_coord_2
# Al Kashafa in same province as Al Redis1 (Albahar).
camps_settlements[camps_settlements$name == "Al Kashafa", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Al Kashafa", "POINT_Y"] <- temp_coord_2
# Al Alagaya also in same province:
camps_settlements[camps_settlements$name == "Al Alagaya", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Al Alagaya", "POINT_Y"] <- temp_coord_2
# Dabat Bosin also in same province:
camps_settlements[camps_settlements$name == "Dabat Bosin", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Dabat Bosin", "POINT_Y"] <- temp_coord_2
# Al Jameya also in same province:
camps_settlements[camps_settlements$name == "Al Jameya", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Al Jameya", "POINT_Y"] <- temp_coord_2
# Khor Al Waral also in same province:
camps_settlements[camps_settlements$name == "Khor Al Waral", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Khor Al Waral", "POINT_Y"] <- temp_coord_2

# Al Nimir and Kario are in the Darfur province:
gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Darfur", ])
temp_coord_1 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Darfur", ]))$coords[1]
temp_coord_2 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Sudan" & shape@data$ADMIN_NAME == "Darfur", ]))$coords[2]
# Al Nimir also in same province:
camps_settlements[camps_settlements$name == "Al Nimir", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Al Nimir", "POINT_Y"] <- temp_coord_2
# Kario also in same province:
camps_settlements[camps_settlements$name == "Kario", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Kario", "POINT_Y"] <- temp_coord_2

# Next, looking at the two cases in Niger:
# For Mangaize and Abala, the coordinates are supplied in another UN document:
# https://reliefweb.int/sites/reliefweb.int/files/resources/131025NGRMangaizeCampProfileOctober2015.pdf
# https://reliefweb.int/sites/reliefweb.int/files/resources/131025NGRabalaCampProfileOct2015.pdf
# Manually enter:
camps_settlements[camps_settlements$name == "Mangaize", "POINT_X"] <- 3.42027778
camps_settlements[camps_settlements$name == "Mangaize", "POINT_Y"] <- 14.68138889
camps_settlements[camps_settlements$name == "Abala", "POINT_X"] <- 3.420278
camps_settlements[camps_settlements$name == "Abala", "POINT_Y"] <- 14.90778

# Next, for the case of Mentao in Burkina Faso:
# Located in the Soum province:
# https://data2.unhcr.org/en/documents/download/33726
gCentroid(shape[shape@data$CNTRY_NAME == "Burkina Faso" & shape@data$ADMIN_NAME == "Soum", ])
temp_coord_1 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Burkina Faso" & shape@data$ADMIN_NAME == "Soum", ]))$coords[1]
temp_coord_2 <- attributes(gCentroid(shape[shape@data$CNTRY_NAME == "Burkina Faso" & shape@data$ADMIN_NAME == "Soum", ]))$coords[2]
camps_settlements[camps_settlements$name == "Mentao", "POINT_X"] <- temp_coord_1
camps_settlements[camps_settlements$name == "Mentao", "POINT_Y"] <- temp_coord_2

# # Evidence that Al Jameya was open in 2018:
# # https://reliefweb.int/report/sudan/sudan-population-operational-update-south-sudanese-refugee-response-1-31-july-2018
# no_long_lat$createdate <- ifelse(no_long_lat$name == "Al Jameya", 
#                                  as.character(ymd("2018-01-01")), as.character(no_long_lat$createdate))
# no_long_lat$createdate <- ymd(no_long_lat$createdate)

# # # Note for paper/records, this assigns missing lat-longs to this set of refugee 
# # # camps but leaves this set of IDP camps unaddressed.

# Match the camps and settlements to provinces:
# First, subset the dataset to only those locations with long-lat coordinates:
camps_settlements <- camps_settlements[!(is.na(camps_settlements$POINT_X)), ]
# Then, extract the long-lat coordinates:
c_s <- camps_settlements[, c("POINT_X", "POINT_Y")]
coordinates(c_s) <- ~POINT_X + POINT_Y
# Project those coordinates to match countries shapefile:
crs(shape) <- crs(utm)
projection(c_s) <- crs(shape)
# Finally, match each camp and settlement to a province:
camps_settlements <- cbind(camps_settlements, over(c_s, shape))

# Then, confirm that that there are no unmatched camps:
length(which(is.na(camps_settlements$GMI_ADMIN)))
# 24 are unmatched. 
# Which are these?
unmatched <- camps_settlements[is.na(camps_settlements$GMI_ADMIN), ]
# A quick review shows that these are coastal locations and the matching process 
# matched the locations with water bodies rather than provinces. Correct this by 
# associating each of the 24 unmatched locations to the nearest province:

unmatched_points <- unmatched[, c("POINT_X", "POINT_Y")]
coordinates(unmatched_points) <- ~POINT_X + POINT_Y
projection(unmatched_points) <- crs(shape)
for(i in 1:nrow(unmatched)){
  crs(unmatched_points) <- crs(utm)
  crs(unmatched_points) <- crs(shape) <- paste("+proj=utm +zone=", 
                                               over(unmatched_points[i,], utm)[2], 
                                               sep = "")
  temp <- which.min(gDistance(unmatched_points[i,], shape, byid=TRUE))
  unmatched[i, colnames(shape@data)] <- 
    shape@data[temp, colnames(shape@data)]
}
# Check that all 24 are now matched:
length(which(is.na(unmatched$GMI_ADMIN)))
# Confirmed.

# Then, replace these entries with the unmatched entries in the camps and settlements
# dataset:
camps_settlements <- rbind(camps_settlements[!(is.na(camps_settlements$GMI_ADMIN)), ], 
                           unmatched)
length(which(is.na(camps_settlements$GMI_ADMIN)))
row.names(camps_settlements) <- 1:nrow(camps_settlements)

# Next, assess whether all of the country names (from the UNHCR website and province 
# .shp file) match. The concern here is that camps/settelemts very close to borders
# might be incorrectly matched to another country:
temp <- camps_settlements[which(!(camps_settlements$Country == camps_settlements$CNTRY_NAME)), 
                          c("Country", "CNTRY_NAME")]

# Correct the following cases:

# 46
# Myanmar
# China
mismatched_point <- camps_settlements[46, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "China"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "China"), ]
camps_settlements[46, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 90
# Turkey
# Syria
mismatched_point <- camps_settlements[90, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Syria"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Syria"), ]
camps_settlements[90, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 112
# Myanmar
# China
mismatched_point <- camps_settlements[112, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "China"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "China"), ]
camps_settlements[112, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 114
# Myanmar
# China
mismatched_point <- camps_settlements[114, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "China"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "China"), ]
camps_settlements[114, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 477
# Djibouti
# Ethiopia
mismatched_point <- camps_settlements[477, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Ethiopia"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Ethiopia"), ]
camps_settlements[477, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 496
# Ethiopia
# Somalia
mismatched_point <- camps_settlements[496, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Somalia"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Somalia"), ]
camps_settlements[496, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 842
# Guinea
# Liberia
mismatched_point <- camps_settlements[842, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Liberia"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Liberia"), ]
camps_settlements[842, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 849
# Guinea
# Sierra Leone
mismatched_point <- camps_settlements[849, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Sierra Leone"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Sierra Leone"), ]
camps_settlements[849, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1204
# Turkmenistan
# Uzbekistan
mismatched_point <- camps_settlements[1204, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Uzbekistan"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Uzbekistan"), ]
camps_settlements[1204, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1307
# Papua New Guinea
# Indonesia
mismatched_point <- camps_settlements[1307, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Indonesia"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Indonesia"), ]
camps_settlements[1307, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1336
# Thailand
# Myanmar (Burma)
mismatched_point <- camps_settlements[1336, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Myanmar (Burma)"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Myanmar (Burma)"), ]
camps_settlements[1336, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1385
# Gabon
# Congo
mismatched_point <- camps_settlements[1385, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Congo"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Congo"), ]
camps_settlements[1385, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1473
# Gabon
# Congo
mismatched_point <- camps_settlements[1473, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Congo"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Congo"), ]
camps_settlements[1473, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1641
# Turkey
# Syria
mismatched_point <- camps_settlements[1641, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Syria"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Syria"), ]
camps_settlements[1641, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1818
# Malawi
# Mozambique
mismatched_point <- camps_settlements[1818, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Mozambique"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Mozambique"), ]
camps_settlements[1818, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 1823
# Malawi
# Mozambique
mismatched_point <- camps_settlements[1823, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "Mozambique"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "Mozambique"), ]
camps_settlements[1823, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 2073
# Myanmar
# China
mismatched_point <- camps_settlements[2073, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "China"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "China"), ]
camps_settlements[2073, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# 2084
# Myanmar
# China
mismatched_point <- camps_settlements[2084, c("POINT_X", "POINT_Y")]
coordinates(mismatched_point) <- ~POINT_X + POINT_Y
projection(mismatched_point) <- crs(shape)
crs(mismatched_point) <- crs(utm)
crs(mismatched_point) <- crs(shape) <- paste("+proj=utm +zone=", 
                                             over(mismatched_point, utm)[2], 
                                             sep = "")
temp <- which.min(gDistance(mismatched_point, 
                            shape[!(shape@data$CNTRY_NAME %in% "China"), ], byid=TRUE))
temp2 <- shape[!(shape@data$CNTRY_NAME %in% "China"), ]
camps_settlements[2084, colnames(shape@data)] <- temp2@data[temp, colnames(temp2@data)]

# Check again:
temp <- camps_settlements[which(!(camps_settlements$Country == camps_settlements$CNTRY_NAME)), 
                          c("Country", "CNTRY_NAME")]

# Next, explore the open, close and update dates:
# First, convert dates:
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ymd(camps_settlements$closedate)
# For update date, separate date from time stamp:
camps_settlements$updatedate <- str_split_fixed(camps_settlements$updatedate, " ", 2)[, 1]
# Then, convert to date format:
camps_settlements$updatedate <- ymd(camps_settlements$updatedate)

# Do the date ranges look reasonable?
summary(camps_settlements$createdate)
summary(camps_settlements$updatedate)
summary(camps_settlements$closedate)
# Yes.

# Now, how many observations have no date (across open, update, and close)?
nrow(camps_settlements[is.na(camps_settlements$createdate) & 
                         is.na(camps_settlements$updatedate) & 
                         is.na(camps_settlements$closedate), ])
# None.

# Then, assess how many cases have only one date:
nrow(camps_settlements[(is.na(camps_settlements$createdate) & 
                         is.na(camps_settlements$updatedate)) |
                         (is.na(camps_settlements$createdate) & 
                            is.na(camps_settlements$closedate)) |
                         (is.na(camps_settlements$updatedate) & 
                            is.na(camps_settlements$closedate)), ])
# 168 cases.
# Which?
one_date <- camps_settlements[(is.na(camps_settlements$createdate) & 
                                 is.na(camps_settlements$updatedate)) |
                                (is.na(camps_settlements$createdate) & 
                                   is.na(camps_settlements$closedate)) |
                                (is.na(camps_settlements$updatedate) & 
                                   is.na(camps_settlements$closedate)), ]

# Some observations --
# For all of these locations, only update dates are listed.
# Nearly all cases are from Uganda and are IDP camps; although, there are a non-trivial 
# number of cases from Turkey as well. 
# Virtually all of the Uganda cases are associated with province UGA-NRT. 

# Given that our camp and settlement measures are binary and take a value
# of 1 if one or more camps are present in a given province-year, for these
# IDPs camps, we need only establish that at least one camp was present per
# year. 

# First, starting with Nigeria:

# Anaka -- "The camp was one of the first to be created in Northern Uganda,
# way back in 1996" (https://reliefweb.int/sites/reliefweb.int/files/resources/93AA7EAD7E739FF4C125705A00412A1E-globalidp-uga-10Aug.pdf)
# Still open as of July 2005: https://www.who.int/hac/crises/uga/sitreps/Ugandamortsurvey.pdf

# Appears that at least some of camps remained open until the end of 2011 --

# "After the peace agreement, the process of resettlement began, 
# led by the government through its peace, recovery and development plan 
# (PRDP), launched in 2007 and implemented by the UNHCR. Last month [THAT WOULD BE 
# DEC 2011, BASED ON ARTICLE DATE], the UN officially ended its assistance 
# in northern Uganda. Some 247 camps have been closed and, according to the UN, 
# 98% of IDPs have since left, returning home or moving elsewhere."
# https://www.theguardian.com/global-development/poverty-matters/2012/jan/24/northern-uganda-displaced-people-out-in-cold

camps_settlements$createdate <- ifelse(camps_settlements$GMI_ADMIN == "UGA-NRT" & 
                             camps_settlements$loc_type == "IDP Camp", 
                           as.character(ymd("1996-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

camps_settlements$closedate <- ifelse(camps_settlements$GMI_ADMIN == "UGA-NRT" & 
                            camps_settlements$loc_type == "IDP Camp", 
                          as.character(ymd("2011-01-01")), as.character(camps_settlements$closedate))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Then, working on Ethiopia:
# Open camps in Ethiopia: Adi Harush, Melkadida, Helaweyn --

# Adi Harush opened in 2010 and still open as of January 2018:
# https://data2.unhcr.org/en/documents/download/62692

camps_settlements$createdate <- ifelse(camps_settlements$name == "Adi Harush", as.character(ymd("2010-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Melkadida opened in February of 2010 (https://data2.unhcr.org/en/documents/download/31820)
# and remained open as of July 2018 (https://data2.unhcr.org/en/documents/download/66709).

camps_settlements$createdate <- ifelse(camps_settlements$name == "Melkadida", as.character(ymd("2010-02-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Helaweyn opened in 2011 and remained open as of July 2018:
# https://data2.unhcr.org/en/documents/download/66708

camps_settlements$createdate <- ifelse(camps_settlements$name == "Helaweyn", as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Next, we deal with the cases in Turkey:
# Yayladagi
# Camp appears to open in 2011: 
# https://www.unhcr.org/58a6bbca7.pdf
# https://unitar.org/unosat/node/44/1655
# https://reliefweb.int/map/turkey/syrian-refugee-camps-yayladagi-yayladagi-district-hatay-province-turkey-4-april-2012
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Yayladagi", 
                                       as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Altinozu
# Camp appears to open in 2011: 
# https://www.washingtonpost.com/blogs/celebritology/post/angelina-jolie-visits-syrian-refugees-photos/2011/06/17/AGUE34YH_story.html
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Altinozu", 
                                       as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Apaydin
# https://reliefweb.int/map/turkey/syrian-refugee-camps-apaydin-merkez-district-hatay-province-turkey-30-march-2012
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Apaydin", 
                                       as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Nizip-1
# https://multimedia.europarl.europa.eu/en/meps-visit-nizip-refugee-camp-in-turkey_N005-160209-001_ev
# https://reliefweb.int/report/turkey/turkey-time-syrian-refugees
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Nizip-1", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Karkamis
# http://c4sr.columbia.edu/conflict-urbanism-aleppo/seminar/Case-Studies/Refugee-Camps-as-Replacement-Urbanism/Refugee-Camps-as-Replacement-Urbanism-Darcey.html#Text8
# https://reliefweb.int/sites/reliefweb.int/files/resources/67208.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Karkamis", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Akcakale
# https://ahvalnews.com/refugees/turkey-empties-syrian-border-refugee-camp
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Akcakale", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Merkez
# https://unitar.org/unosat/node/44/1653
# https://reliefweb.int/sites/reliefweb.int/files/resources/67208.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Merkez", 
                                       as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Islahiye
# https://www.un.org/sg/en/content/sg/press-encounter/2012-12-07/secretary-generals-press-encounter-islahiye-refugee-camp; https://reliefweb.int/sites/reliefweb.int/files/resources/67208.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Islahiye", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Nizip-2 
# https://reliefweb.int/report/turkey/turkeys-approach-syrian-refugees-sustainable
# Article references being open "recently" in 2013. 
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Nizip-2", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Oncupinar 
# https://www.worldbulletin.net/general/turkish-refugee-camp-nicest-in-the-world-h128915.html
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Oncupinar", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Elbeyli 
# https://data2.unhcr.org/ar/documents/download/38166
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Elbeyli", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Midyat 
# https://reliefweb.int/report/turkey/unhcr-turkey-syrian-refugee-daily-sitrep-9-july-2013
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Midyat", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Ceylanpinar 
# https://www.nytimes.com/2012/07/31/world/middleeast/syrian-refugees-escape-to-a-barren-plain-of-sweat-and-grit-in-turkey.html
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Ceylanpinar", 
                                       as.character(ymd("2012-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Harran 
# https://data2.unhcr.org/en/documents/download/38547
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Harran", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Yayladagi-1
# Cannot distinguish between Yayladagi and Yayladagi-1. However, we note that they are located in the same area. Therefore,
# assigning Yayladagi's create date to Yayladagi-1:
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Yayladagi-1", 
                                       as.character(ymd("2011-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Saricam
# https://www.brookings.edu/wp-content/uploads/2016/06/Turkey-and-Syrian-Refugees_The-Limits-of-Hospitality-2014.pdf
# https://data2.unhcr.org/en/documents/download/37950
# https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR_Syrian-Refugee-Camps-and-Provincial-Breakdown-of-Syrian-Refugees-Registered-in-South-East-Turkey-November-2018-01.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Saricam", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Next, for Uganda:
# Lolim
# https://www.who.int/hac/crises/uga/sitreps/Ugandamortsurvey.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Uganda" & camps_settlements$name == "Lolim", 
                                       as.character(ymd("2005-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# The, for Iran:
# Taft
# https://reliefweb.int/sites/reliefweb.int/files/resources/wfp_iran_evaluation.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Islamic Republic of Iran" & camps_settlements$name == "Taft", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Meybod
# https://reliefweb.int/sites/reliefweb.int/files/resources/wfp_iran_evaluation.pdf; https://www.unhcr.org/ir/sub-office-kerman/
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Islamic Republic of Iran" & camps_settlements$name == "Meybod", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Ardakan
# https://www.unhcr.org/539ab62a9.pdf; https://www.unhcr.org/ir/sub-office-kerman/
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Islamic Republic of Iran" & camps_settlements$name == "Ardakan", 
                                       as.character(ymd("2015-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Then, for Cameroon:
# Yokadouma
# https://www.unhcr.org/53cf62449.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Cameroon" & camps_settlements$name == "Yokadouma", 
                                       as.character(ymd("2014-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Ngari-singo
# https://www.humanitarianresponse.info/sites/www.humanitarianresponse.info/files/documents/files/profildesitengarinsingo_octobre2016_final.pdf
# https://data2.unhcr.org/en/documents/details/62535
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Cameroon" & camps_settlements$name == "Ngari-singo", 
                                       as.character(ymd("2014-04-24")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# # # Note for paper/records, this assigns missing open/close dates to this set of 
# # # refugee camps but leaves this set of IDP camps unaddressed.

# Now, if close date is "NA", set close date to last possible date (2019); we use 2019
# rather than 2018, the last year of our study period, for the purposes of running
# placebo tests:
camps_settlements$closedate <- ifelse(is.na(camps_settlements$closedate), as.character("2019-12-31"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Now, given that the update date carries little useful information for establishing
# open and close dates --
summary(camps_settlements$updatedate) 
# -- are there any remaining cases with no create date?
no_create <- camps_settlements[(is.na(camps_settlements$createdate)), ]
# Two remaining cases:
table(no_create$name)
# Altinozu-1 and Duzici in Turkey.
# For Altinozu-1, the following State Department map shows that it was open as early
# as 2013: 
# https://reliefweb.int/sites/reliefweb.int/files/resources/Syria%20Numbers%20and%20Locations%20of%20Refugees%20and%20IDPs%20July%202013.pdf
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Altinozu-1", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# The first reference we were able to find showing individuals at Duzici is 2013:
# https://data2.unhcr.org/ar/documents/download/36990
camps_settlements$createdate <- ifelse(camps_settlements$Country == "Turkey" & camps_settlements$name == "Duzici", 
                                       as.character(ymd("2013-01-01")), as.character(camps_settlements$createdate))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

# Now, check which cases involve:
# Close dates listed before create dates:
camps_settlements$close_before_create <- camps_settlements$createdate > camps_settlements$closedate
# Update dates before create dates:
camps_settlements$update_before_create <- camps_settlements$createdate > camps_settlements$updatedate

check1 <- camps_settlements[camps_settlements$close_before_create %in% "TRUE", ]
check2 <- camps_settlements[camps_settlements$update_before_create %in% "TRUE", ]

# 242 cases in which close date comes before create date.
# No cases of update date coming before create date.

# # Export for lab to analyze:
# write.csv(check1, "~/Dropbox/Refugees_Conflict_Article/GitHub/refugeeconflict/ResearchAssistance/close_before_create.csv")

# First, dealing with the IDP camps in Chad's Ouaddai province:
# According to the UN locations dataset, all three locations close in 2004 and 
# open in 2011. Yet, all three are apparently active in the many of the years in between. 
# One possibility is that the dates are simply swtiched. Another is that the are simply 
# wrong. Given that we cannot determine directly, we instead look at the span of years
# when the camps were open based on reports found online. 

# Generally, it appears that conflict in Darfur spilled over into Chad in 2006.
# https://www.hrw.org/news/2006/02/21/chad-darfur-conflict-spills-across-border#
# https://www.hrw.org/legacy/backgrounder/africa/chad0206/3.htm

# Alacha -- 
# Evidence of Alacha being open as early as 2007 --
# https://www.refworld.org/pdfid/46811f3b2.pdf --
# and remaining open from 2008 through 2012. See:
# Open in 2008:
# https://reliefweb.int/report/chad/humanitarian-needs-and-response-chad-weekly-information-bulletin-16-apr-2008
# Open in 2009:
# https://www.unhcr.org/4ce5327f9.pdf
# Open from 2007-2010:
# https://www.internal-displacement.org/sites/default/files/publications/documents/201011-af-chad-national-outrage-chad-violence-against-internally-displaced-women-and-girls-in-eastern-chad-country-en_0.pdf
# Open in 2011:
# https://www.internal-displacement.org/sites/default/files/publications/documents/201106-af-chad-overview-en.pdf
# Open in 2012:
# https://reliefweb.int/sites/reliefweb.int/files/resources/map_1981.pdf

# Goundiang -- 
# Evidence of being open in 2007 and 2008:
# https://www.refworld.org/pdfid/4b45ac0c2.pdf
# And in 2010:
# https://reliefweb.int/report/chad/chad-response-food-crisis-sahel-premi%C3%A8re-urgence-helping-25000-displaced-native-refugees
# And in 2011:
# https://www.alwihdainfo.com/attachment/258931/

# Mourouske (very limited information online) -- 
# Evidence of being open in 2007:
# https://www.refworld.org/pdfid/46cda3a10.pdf

camps_settlements$createdate <- ifelse(camps_settlements$name %in% "Alacha", 
                                       as.character("2007-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$createdate <- ifelse(camps_settlements$name %in% "Goundiang", 
                                       as.character("2008-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$createdate <- ifelse(camps_settlements$name %in% "Mourouske", 
                                       as.character("2007-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)

camps_settlements$closedate <- ifelse(camps_settlements$name %in% "Alacha", 
                                       as.character("2012-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% "Goundiang", 
                                       as.character("2011-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% "Mourouske", 
                                       as.character("2007-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Next, given that many of these camps and settlements are in a single province, 
# instead of correcting the individual camp/settlement data, for 
# those provinces in question, we identify all years when at least one refugee 
# camp or settlement was open, and then assign this date range to all relevant locations:

# Open province timeline document for this exercise:
province_timelines <- read.csv("Province_Timelines_PVL.csv", header = T)

# For the dates used in the following corrections, see the Province_Timelines_PVL.csv
# datasheet.

# "Trebil (no man's land)" refugee camp in Jordan's Al Mafraq province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Trebil (no man's land)"), 
                                       as.character("2006-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Trebil (no man's land)"), 
                                       as.character("2009-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Dehema and Salmin refugee camps in Sudan's Ash Sharqiyah province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Dehema"), 
                                       as.character("1969-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Dehema"), 
                                      as.character("2000-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Salmin"), 
                                       as.character("1969-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Salmin"), 
                                      as.character("2000-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "UAE Red Crescent" refugee camp in Pakistan's Baluchistan province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("UAE Red Crescent"), 
                                       as.character("2004-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("UAE Red Crescent"), 
                                      as.character("2008-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Boké" refugee camp in Guinea's Boke province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Boké"), 
                                       as.character("1998-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Boké"), 
                                      as.character("1999-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "SARAJEVO-RAJLOVAC" refugee camp in Bosnia and Herzegovina:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("SARAJEVO-RAJLOVAC"), 
                                       as.character("1999-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("SARAJEVO-RAJLOVAC"), 
                                      as.character("2000-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Next, for the significant number of refugee settlements listed in Mexico's Chiapas 
# province, discussions with Professor Óscar F. Gil-García, Department of Human Development,
# Binghamton University, indicate that these camps never ceased to exist, and date back 
# to at least the first year of our study period:
camps_settlements$createdate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Chiapas") & 
                                         camps_settlements$loc_type == "Refugee Settlement", 
                                       as.character("1990-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Chiapas") & 
                                        camps_settlements$loc_type == "Refugee Settlement", 
                                      as.character("2018-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Cishemeye I" and "Cishemeye II" refugee settlements of Burundi's Cibitoke province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Cishemeye I", "Cishemeye II"), 
                                       as.character("2002-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Cishemeye I", "Cishemeye II"), 
                                      as.character("2004-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Aour-Aoussa" refugee camp in Djibouti's Dikhil province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Aour-Aoussa"), 
                                       as.character("2003-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Aour-Aoussa"), 
                                      as.character("2005-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# We were unable to find any information about the "Santa Amalia" refugee settlement in 
# Mexico's Distrito Federal. The create date is currently listed as 2000-11-08 and the close
# date as 2000-01-01. Therefore, assigning the general year of 2000 to both:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Santa Amalia"), 
                                       as.character("2000-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Santa Amalia"), 
                                      as.character("2000-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Next, dealing with the large number of refugee camps in Guinea's Forecariah province:
camps_settlements$createdate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Forecariah") & 
                                         camps_settlements$loc_type == "Refugee Camp", 
                                       as.character("1998-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Forecariah") & 
                                        camps_settlements$loc_type == "Refugee Camp", 
                                      as.character("2002-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Then, dealing with the large number of refugee camps in Guinea's Gueckedou province:
camps_settlements$createdate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Gueckedou") & 
                                         camps_settlements$loc_type == "Refugee Camp", 
                                       as.character("1997-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Gueckedou") & 
                                        camps_settlements$loc_type == "Refugee Camp", 
                                      as.character("2003-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Then, dealing with the various refugee camps (and one settlement) in the 
# Democratic Republic of the Congo's Haut-Zaire province:
camps_settlements$createdate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Haut-Zaire") & 
                                         camps_settlements$loc_type %in% c("Refugee Camp", "Refugee Settlement"), 
                                       as.character("2000-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$ADMIN_NAME %in% c("Haut-Zaire") & 
                                        camps_settlements$loc_type %in% c("Refugee Camp", "Refugee Settlement"), 
                                      as.character("2006-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Kinkole" refugee camp in the Democratic Republic of the Congo's Kinshasa province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Kinkole"), 
                                       as.character("1996-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Kinkole"), 
                                      as.character("1998-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Kondi-Mbaka", "Komi", and "Mavoadi" refugee camps in the Republic of Congo's 
# Kouilou province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Kondi-Mbaka", "Komi", "Mavoadi"), 
                                       as.character("1993-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Kondi-Mbaka", "Komi", "Mavoadi"), 
                                      as.character("2004-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Bossou centre", "Gba", and "Thuo centre" refugee camps in Guinea's Lola province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Bossou centre", "Gba", "Thuo centre"), 
                                       as.character("1990-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Bossou centre", "Gba", "Thuo centre"), 
                                      as.character("2003-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Then, dealing with the large number of refugee camps in Guinea's Macenta province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% check1[check1$ADMIN_NAME == "Macenta", ]$name & 
                                         camps_settlements$loc_type %in% c("Refugee Camp"), 
                                       as.character("1997-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% check1[check1$ADMIN_NAME == "Macenta", ]$name & 
                                        camps_settlements$loc_type %in% c("Refugee Camp"), 
                                      as.character("2003-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Samvura" refugee camp in Burundi's Makamba province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Samvura"), 
                                       as.character("2007-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Samvura"), 
                                      as.character("2009-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Achol-Pii" and "Kyebitaka" refugee settlements in Uganda's Northern province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Achol-Pii", "Kyebitaka"), 
                                       as.character("1996-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Achol-Pii", "Kyebitaka"), 
                                      as.character("2003-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Kola" refugee camp in Guinea's Nzerekore province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Kola"), 
                                       as.character("2002-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Kola"), 
                                      as.character("2005-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Adré" refugee camp in Chad's Ouaddai province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Adré"), 
                                       as.character("2005-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Adré"), 
                                      as.character("2019-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Kintele" the  Republic of the Congo's Pool province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Kintele"), 
                                       as.character("1997-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Kintele"), 
                                      as.character("1999-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Grat Reda" refugee camp in Ethiopia's Tigray province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Grat Reda"), 
                                       as.character("2000-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Grat Reda"), 
                                      as.character("2019-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# "Bamboudi (Bamboudi)" refugee camp in Ethiopia's Welega province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% c("Bamboudi (Bamboudi)"), 
                                       as.character("2001-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% c("Bamboudi (Bamboudi)"), 
                                      as.character("2019-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Finally, dealing with the many refugee camps in Guinea's Yomou province:
camps_settlements$createdate <- ifelse(camps_settlements$name %in% check1[check1$ADMIN_NAME == "Yomou", ]$name & 
                                         camps_settlements$loc_type %in% c("Refugee Camp"), 
                                       as.character("2001-01-01"), as.character(ymd(camps_settlements$createdate)))
camps_settlements$createdate <- ymd(camps_settlements$createdate)
camps_settlements$closedate <- ifelse(camps_settlements$name %in% check1[check1$ADMIN_NAME == "Yomou", ]$name & 
                                        camps_settlements$loc_type %in% c("Refugee Camp"), 
                                      as.character("2003-01-01"), as.character(ymd(camps_settlements$closedate)))
camps_settlements$closedate <- ymd(camps_settlements$closedate)

# Are there any cases of close before create left?
camps_settlements$close_before_create <- NULL
camps_settlements$close_before_create <- camps_settlements$createdate > camps_settlements$closedate
check1 <- camps_settlements[camps_settlements$close_before_create %in% "TRUE", ]
# No.

# Now, add the the camps and settlements data to the panel:
# First, create refugee and IDP camp and settlement indicator:
camps_settlements$idpc <- ifelse(camps_settlements$loc_type == "IDP Camp", 1, 0)
camps_settlements$idps <- ifelse(camps_settlements$loc_type == "IDP Settlement", 1, 0)
camps_settlements$rc <- ifelse(camps_settlements$loc_type == "Refugee Camp", 1, 0)
camps_settlements$rs <- ifelse(camps_settlements$loc_type == "Refugee Settlement", 1, 0)

# # # Now, at this stage in the code, in order to create a variable later that indicates whether a
# # # new site has been created, carry out the following steps:

# First, create camp and settlement (and camp and settlement) subsets:
temp_rc_rs <- camps_settlements[camps_settlements$rc %in% 1 | camps_settlements$rs %in% 1,  ]
temp_rc <- camps_settlements[camps_settlements$rc %in% 1,  ]
temp_rs <- camps_settlements[camps_settlements$rs %in% 1,  ]

# Next, determine for each subset for each province the unique years in which a new site opened:
# First, set each create date to the relevant year floor:
temp_rc_rs$create_year <- floor_date(temp_rc_rs$createdate, "year")
temp_rc$create_year <- floor_date(temp_rc$createdate, "year")
temp_rs$create_year <- floor_date(temp_rs$createdate, "year")

# Now, extract the unique years:
rc_rs_unique_years <- setDT(temp_rc_rs)[, list(open_years = unique(create_year)), by = GMI_ADMIN ]
rc_unique_years <- setDT(temp_rc)[, list(open_years = unique(create_year)), by = GMI_ADMIN ]
rs_unique_years <- setDT(temp_rs)[, list(open_years = unique(create_year)), by = GMI_ADMIN ]

# # # Now, return to normal script:

# Then, extract camp/settlement data for each year of the study:
camps_settlements_by_year <- list()
for(i in 1:length(1990:2019)){
  year_index <- list()
  for(j in 1:nrow(camps_settlements)){
    if((ymd("1989-01-01") + years(i)) %in% 
       seq(floor_date(camps_settlements$createdate[j], "year"), 
                      floor_date(camps_settlements$closedate[j], "year"), by = "years")){
      year_index[[j]] <- row.names(camps_settlements[j, ])
    }
  }
  year_index <- year_index[-which(sapply(year_index, is.null))]
  year_index <- rbind(year_index)
  year_index <- as.numeric((as.character(year_index)))
  camps_settlements_by_year[[i]] <- camps_settlements[year_index, ]
  # Then, create province-level IDP/refugee camp and settlement count and 
  # indicator variables:
  camps_settlements_by_year[[i]] <- camps_settlements_by_year[[i]] %>%
    group_by(GMI_ADMIN) %>%
    summarize(idpct = sum(idpc), idpst = sum(idps), rct = sum(rc), rst = sum(rs),
      idpcb = max(idpc), idpsb = max(idps), rcb = max(rc), rsb = max(rs))
  # Add year variable:
  camps_settlements_by_year[[i]]$Year <- ymd("1989-01-01") + years(i)
  # Then, match each yearly subset of provinces to the full country panel for that year;
  # however, if the year is 2019, save that data for later use:
  if(i == max(length(1990:2019))){
    temp_2019 <- camps_settlements_by_year[[i]]
  }
  camps_settlements_by_year[[i]] <- join(dataset[dataset$Year == (ymd("1989-01-01") + years(i)), ], camps_settlements_by_year[[i]])
  # Then, assign values of 0 of NAs (that is, for cases in which provinces had no IDP/
  # refugee camps or settlements):
  camps_settlements_by_year[[i]]$idpct[is.na(camps_settlements_by_year[[i]]$idpct)] <- 0
  camps_settlements_by_year[[i]]$idpst[is.na(camps_settlements_by_year[[i]]$idpst)] <- 0
  camps_settlements_by_year[[i]]$rct[is.na(camps_settlements_by_year[[i]]$rct)] <- 0
  camps_settlements_by_year[[i]]$rst[is.na(camps_settlements_by_year[[i]]$rst)] <- 0
  
  camps_settlements_by_year[[i]]$idpcb[is.na(camps_settlements_by_year[[i]]$idpcb)] <- 0
  camps_settlements_by_year[[i]]$idpsb[is.na(camps_settlements_by_year[[i]]$idpsb)] <- 0
  camps_settlements_by_year[[i]]$rcb[is.na(camps_settlements_by_year[[i]]$rcb)] <- 0
  camps_settlements_by_year[[i]]$rsb[is.na(camps_settlements_by_year[[i]]$rsb)] <- 0
}

dataset <- rbindlist(camps_settlements_by_year)

# Then create an indicator variable for either IDP and, separately, refugee camps 
# or settlements:
dataset$idpb <- ifelse(dataset$idpcb == 1 | dataset$idpsb == 1, 1, 0)
dataset$rtb <- ifelse(dataset$rcb == 1 | dataset$rsb == 1, 1, 0)

# Rename camp and settlement counts:
dataset <- plyr::rename(dataset, c("idpct"="idpc", "idpst"="idps", "rct"="rc", "rst"="rs"))

# Also, create count totals for IDP and, separately, refugee camps and settlements:
dataset$idpt <- dataset$idpc + dataset$idps
dataset$rt <- dataset$rc + dataset$rs

# Now, create indicator variable for refugee presence in another province of 
# a given country-year:
CntryYearSite <- dataset %>% 
  group_by(GMI_CNTRY, Year)  %>% 
  summarise(cntryyearrsb = sum(rsb))
dataset <- join(dataset, CntryYearSite, by = c("GMI_CNTRY", "Year"))

CntryYearSite <- dataset %>% 
  group_by(GMI_CNTRY, Year)  %>% 
  summarise(cntryyearrcb = sum(rcb))
dataset <- join(dataset, CntryYearSite, by = c("GMI_CNTRY", "Year"))

CntryYearSite <- dataset %>% 
  group_by(GMI_CNTRY, Year)  %>% 
  summarise(cntryyearrtb = sum(rtb))
dataset <- join(dataset, CntryYearSite, by = c("GMI_CNTRY", "Year"))

# Then, create variable indicating whether another province-year unit hosted 
# refugees regardless of whether the province-year unit itself does as well:
dataset$rsb.other <- ifelse(dataset$rsb == 0 & dataset$cntryyearrsb > 0, 1, 
                            ifelse(dataset$rsb == 1 & dataset$cntryyearrsb > 1, 1, 0))
dataset$rcb.other <- ifelse(dataset$rcb == 0 & dataset$cntryyearrcb > 0, 1, 
                            ifelse(dataset$rcb == 1 & dataset$cntryyearrcb > 1, 1, 0))
dataset$rtb.other <- ifelse(dataset$rtb == 0 & dataset$cntryyearrtb > 0, 1, 
                            #if no rtb in my province-year, but rtb in another province-year, 1
                            ifelse(dataset$rtb == 1 & dataset$cntryyearrtb > 1, 1, 0))
#if rtb in my province-year, but also rtb in another province-year, 1, else 0

# Add covariates to the dataset -- 

# Ruggdness data:
# SJM has two observations in ruggedness data because Svalbard and Jan Mayen are listed under separate
# contry names but not separate country codes.
# Drop Jan Mayen as before:
ruggedness <- ruggedness[-(1125), ]
dataset <- join(dataset, ruggedness[, c("GMI_ADMIN", "MIN", "MAX", "RANGE", "MEAN", "STD", "SQKM_ADMIN")], by = "GMI_ADMIN")

# First, Match the PRIO GRID data to the provinces:
# plot(prio_grid_centroids)
# crs(prio_grid_centroids)
crs(shape) <- crs(prio_grid_cells) <- crs(prio_grid_centroids)  <- crs(utm)
# Identify the provinces included in the dataset:
GMI_ADMIN <- unique(dataset$GMI_ADMIN)
# Then, match PRIO GRID centroids to provinces:
prio_grid_province_match <- cbind(prio_grid_centroids@data, over(prio_grid_centroids, shape[shape@data$GMI_ADMIN %in% GMI_ADMIN, ]))
# Subset to matches:
prio_grid_province_match <- prio_grid_province_match[!(is.na(prio_grid_province_match$GMI_ADMIN)), ]
# Then, identify those provinces that were not matched to a PRIO GRID:
unmatched_GMI_ADMIN <- GMI_ADMIN[which(!(GMI_ADMIN %in% prio_grid_province_match$GMI_ADMIN))]
# Next, for those unmatched provinces, intersect them with PRIO GRID cells: 
prio_grid_province_match_remaining <- raster::intersect(prio_grid_cells, shape[shape@data$GMI_ADMIN %in% unmatched_GMI_ADMIN, ])
prio_grid_province_match_remaining <- prio_grid_province_match_remaining@data
# Then, combine the two datasets:
prio_grid_matches <- rbind(prio_grid_province_match, prio_grid_province_match_remaining)

# Then, match the relevant PRIO cells to the covariates:
prio_grid <- join(prio_grid_matches, prio_grid)
# Create matching year variable:
prio_grid$Year <- ymd(paste(prio_grid$year, "-01", "-01", sep = ""))

# Note that there are a very small number of PRIO GRIDs (186 of more 
# than 1.5 million) that are not associated with data (but instead 
# appear to have came from the PRIO GRID centroid/cell datasets) that 
# are matched with provinces. These are, therefore, dropped:
prio_grid <- prio_grid[!(is.na(prio_grid$Year)), ]

# Now, note that, given that intersection matching process used above, 
# there are a number of cases where PRIO GRIDs are assigned to more than 
# one province. For purposes of generating estimates of population and 
# economic activity, only one set of grids is needed:
prio_grid_temp <- prio_grid
prio_grid_temp$unique <- paste(prio_grid$gid, prio_grid$year, sep = "_")
prio_grid_temp <- prio_grid_temp[!duplicated(prio_grid_temp$unique),]

# Linearly impute missing years for population data and then add to dataset:
# Sort PRIO GRID dataset:
prio_grid_temp <- prio_grid_temp[with(prio_grid_temp, order(gid, Year)), ]
row.names(prio_grid_temp) <- 1:nrow(prio_grid_temp)
pop <- prio_grid_temp[prio_grid_temp$year >= 1990, c("gid","Year","pop_gpw_sum")]
row.names(pop) <- 1:nrow(pop)
pop2 <- tapply(pop$pop_gpw_sum, pop$gid, function(x) approxfun(1:25, 
                                                               x, method = "linear", rule = 2)(1:25))

for(i in 1:length(pop2)){
  yearly_diff_09_10 <- pop2[[i]][21] - pop2[[i]][20]
  pop2[[i]][23] <- pop2[[i]][23] + yearly_diff_09_10
  pop2[[i]][24] <- pop2[[i]][24] + yearly_diff_09_10*2
  pop2[[i]][25] <- pop2[[i]][25] + yearly_diff_09_10*3
  
  pop2[[i]][26] <- pop2[[i]][27] <-
  pop2[[i]][28] <- pop2[[i]][29] <- NA
    
  pop2[[i]][26] <- pop2[[i]][21] + yearly_diff_09_10*4
  pop2[[i]][27] <- pop2[[i]][21] + yearly_diff_09_10*5
  pop2[[i]][28] <- pop2[[i]][21] + yearly_diff_09_10*6
  pop2[[i]][29] <- pop2[[i]][21] + yearly_diff_09_10*7
}

# Extend population data through 2018:
Year <- seq(min(pop$Year), ymd("2018-01-01"), by = "years")
Year <- as.data.frame(Year)
# Extract vector of relevant GRIDs:
gid <- unique(pop$gid)

list <- list()
for(i in 1:length(gid)){
  list[[i]] <- Year
  list[[i]]$gid <- gid[i]
}

df <- as.data.frame(rbindlist(list))
df$pop_gpw_sum <- unlist(pop2)
# Then, match province names to grid cells:
grid_province_match <- distinct(prio_grid[, c("gid", "GMI_ADMIN")])
pop <- plyr::join(df, grid_province_match, by = "gid", match = "first")
# pop <- join(df, prio_grid[, c("gid", "GMI_ADMIN", "Year")], by = c("gid", "Year"), type = "left")

# Then, aggregate to the province-year:
pop2  <- pop  %>% 
  group_by(GMI_ADMIN, Year)  %>% 
  summarise(pop3 = sum(pop_gpw_sum, na.rm = TRUE))
colnames(pop2)[3] <- "pop"
dataset <- left_join(dataset, pop2[, c("Year", "GMI_ADMIN", "pop")], 
                     by = c("GMI_ADMIN", "Year"))

# Created logged population variable:
# First, add 1 for 0 values:
dataset$pop <- dataset$pop + 1
dataset$log_pop <- log(dataset$pop)

# Next, create a province-level measure of economic activity:
# First, sum up to province levels
gdp2  <- prio_grid_temp  %>%  # sum up to province levels
  group_by(GMI_ADMIN, year)  %>% 
  summarise(gcp_ppp = sum(gcp_ppp, na.rm = T))

# Are any values of 0 assigned?
summary(prio_grid_temp[!(is.na(prio_grid_temp$gcp_ppp)) & 
                         prio_grid_temp$year %in% c(1990, 1995, 2000, 2005), ]$gcp_ppp)

# Yes. However, because we sum GCP PPP (and are not averaging), we can 
# ignore values of 0 and assign NAs.
gdp2$gcp_ppp[gdp2$gcp_ppp == 0] <- NA

# However, some provinces have NAs for all values:
temp <- gdp2 %>%
  group_by(GMI_ADMIN) %>%
  summarise(test = length(which(is.na(gcp_ppp))))
table(temp$test)
# 22   23   24   25   26 
# 2101   89   34  108   18 

# 18 cases of NAs for all; 108 cases of a single value; 34 cases of 
# two values; and 89 cases of three values. The remaining 2101 provinces
# have data for all four years (1990, 1995, 2000, 2005).

# Because we cannot interpolate for provinces with all NAs, we first 
# remove those provinces:
gmiNA <- names(which(is.na(tapply(gdp2$gcp_ppp, gdp2$GMI_ADMIN, mean, na.rm = T)) == T))
gdp2$GMI_ADMIN <- as.character(gdp2$GMI_ADMIN)
gdp3 <- gdp2[!(gdp2$GMI_ADMIN %in% gmiNA),] 
# Confirm that they have been dropped:
which(is.na(tapply(gdp3$gcp_ppp, gdp3$GMI_ADMIN, mean, na.rm = T)) == T)

# Extend years through 2018 and drop 1989:
Year <- seq(min(pop$Year), ymd("2018-01-01"), by = "years")
Year <- as.data.frame(Year)
# Extract vector of relevant GRIDs:
GMI_ADMIN <- unique(gdp3$GMI_ADMIN)

list <- list()
for(i in 1:length(GMI_ADMIN)){
  list[[i]] <- Year
  list[[i]]$GMI_ADMIN <- GMI_ADMIN[i]
}

df <- as.data.frame(rbindlist(list))

# First, create year variable in GDP data:
gdp3$Year <- ymd(paste(gdp3$year, "01", "01", sep = "-"))
# Then, add years:
gdp3 <- join(df, gdp3[, c("GMI_ADMIN", "gcp_ppp", "Year" )], by = c("GMI_ADMIN", "Year"), type = "left")

# Then, for provinces with just one year of gcp_ppp data, use that 
# value for entire timespan:
gmiONE <- names(which((tapply(gdp2$gcp_ppp, gdp2$GMI_ADMIN, 
                              function(x) sum(is.na(x)) == length(x)-1)) == T))
gdp4 <- gdp3[gdp3$GMI_ADMIN %in% gmiONE & is.na(gdp3$gcp_ppp) == F, c("GMI_ADMIN", "gcp_ppp")]
gdp5 <- merge(gdp3, gdp4, by = "GMI_ADMIN", all.x = T)
gdp5$gcp_ppp <- ifelse(gdp5$GMI_ADMIN %in% gmiONE, gdp5$gcp_ppp.y, gdp5$gcp_ppp.x)

# Reorder gdp data:
gdp5 <- gdp5[order(gdp5$GMI_ADMIN, gdp5$Year),]
row.names(gdp5) <- 1:nrow(gdp5)

# Now, linearly interpolate for provinces with at least two observations:
gdp6 <- tapply(gdp5$gcp_ppp, gdp5$GMI_ADMIN, function(x) 
  approxfun(1:29, x, method = "linear", rule = 2)(1:29))

for(i in 1:length(gdp6)){
  
  yearly_diff_04_05 <- gdp6[[i]][16] - gdp6[[i]][15]
  
  gdp6[[i]][17] <- gdp6[[i]][18] <-
  gdp6[[i]][19] <- gdp6[[i]][20] <-
  gdp6[[i]][21] <- gdp6[[i]][22] <-
  gdp6[[i]][23] <- gdp6[[i]][24] <-
  gdp6[[i]][25] <- gdp6[[i]][26] <-
  gdp6[[i]][27] <- gdp6[[i]][28] <- 
  gdp6[[i]][29] <- NA
  
  gdp6[[i]][17] <- gdp6[[i]][16] + yearly_diff_04_05
  gdp6[[i]][18] <- gdp6[[i]][16] + yearly_diff_04_05*2
  gdp6[[i]][19] <- gdp6[[i]][16] + yearly_diff_04_05*3
  gdp6[[i]][20] <- gdp6[[i]][16] + yearly_diff_04_05*4
  gdp6[[i]][21] <- gdp6[[i]][16] + yearly_diff_04_05*5
  gdp6[[i]][22] <- gdp6[[i]][16] + yearly_diff_04_05*6
  gdp6[[i]][23] <- gdp6[[i]][16] + yearly_diff_04_05*7
  gdp6[[i]][24] <- gdp6[[i]][16] + yearly_diff_04_05*8
  gdp6[[i]][25] <- gdp6[[i]][16] + yearly_diff_04_05*9
  gdp6[[i]][26] <- gdp6[[i]][16] + yearly_diff_04_05*10
  gdp6[[i]][27] <- gdp6[[i]][16] + yearly_diff_04_05*11
  gdp6[[i]][28] <- gdp6[[i]][16] + yearly_diff_04_05*12
  gdp6[[i]][29] <- gdp6[[i]][16] + yearly_diff_04_05*13
}

gdp5$gcp_ppp <- unlist(gdp6) 
dataset <- left_join(dataset, gdp5[, c("Year", "GMI_ADMIN", "gcp_ppp")], 
                     by = c("GMI_ADMIN", "Year"))

# Now, for a very small number of cases, negative values of gcp_ppp will have been generated
# through the linear interpolation process. For these, set to 0:
dataset$gcp_ppp <- ifelse(dataset$gcp_ppp < 0, 0, dataset$gcp_ppp)

# Next, add distance measures:
temp <- prio_grid  %>% 
  group_by(GMI_ADMIN)  %>% 
  summarise(bdist1 = mean(bdist1, na.rm = TRUE), 
            bdist2 = mean(bdist2, na.rm = TRUE), 
            bdist3 = mean(bdist3, na.rm = TRUE), 
            capdist = mean(capdist, na.rm = TRUE))
dataset <- join(dataset, temp, by = "GMI_ADMIN")
rm(temp)

# Create logged border distance `bdist2` and capital distance `capdist` measures:
dataset$log_bdist2 <- log(dataset$bdist2 + 1)
dataset$log_capdist <- log(dataset$capdist + 1)

# Create two war border variables -- average and total number of attacks in all contiguous 
# provinces:

# First, modify adjacency matrix:
adjdf <- as.data.frame(adjmat[[1]])
adjdf <- adjdf[-(1125), -(1125)]
names <- shape@data$GMI_ADMIN
rownames(adjdf) <- names
colnames(adjdf) <- names

# Then, create attack and battledeath indicator variables:
ged_90$incident_state_based <- ifelse(ged_90$type_of_violence %in% 1, 1, 0)
ged_90$incident_non_state <- ifelse(ged_90$type_of_violence %in% 2, 1, 0)
ged_90$incident_one_sided <- ifelse(ged_90$type_of_violence %in% 3, 1, 0)
ged_90$best_state_based <- ifelse(ged_90$type_of_violence %in% 1, ged_90$best, 0)
ged_90$best_non_state <- ifelse(ged_90$type_of_violence %in% 2, ged_90$best, 0)
ged_90$best_one_sided <- ifelse(ged_90$type_of_violence %in% 3, ged_90$best, 0)

# Create year variable:
ged_90$Year <- ymd(paste(ged_90$year, 01, 01, sep = "-"))

neighbor_list <- list()
# For each province:
for(i in 1:length(adjdf)){
  # If that province is included in the set we study:
  if(row.names(adjdf)[i] %in% dataset$GMI_ADMIN){
    # Determine its contiguous neighbors:
    temp <- which(adjdf[i, ] == 1)
    # Does province have adjacencies? Only move forward with processing if so:
    if(length(temp) > 1){
      # Then, remove province itself:
      temp <- temp[!temp %in% i]
      # Then, use list of identified neighbors to subset GED incident-level dataset:
      ged_temp <- ged_90[ged_90$GMI_ADMIN %in% row.names(adjdf[temp, ]), ]
      # Next, aggregate/average observations for provices for which data exists:
      # Note that some province-years will not meet this and the threshold and will simply 
      # be assigned NAs during subsequent matching with the complete panel, 
      # at which point they will be assigned NAs:
      # First, calculate the sums by type:
      if(nrow(ged_temp) > 0){
        neighbors_sums_means <- ged_temp %>%
          group_by(Year) %>%
          summarise(attack_neighbors_sum = sum(incident, na.rm = T), 
                    attack_state_based_neighbors_sum = sum(incident_state_based, na.rm = T), 
                    attack_non_state_neighbors_sum = sum(incident_non_state, na.rm = T), 
                    attack_one_sided_neighbors_sum = sum(incident_one_sided, na.rm = T),
                    best_neighbors_sum = sum(best, na.rm = T), 
                    best_state_based_neighbors_sum = sum(best_state_based, na.rm = T), 
                    best_non_state_neighbors_sum = sum(best_non_state, na.rm = T), 
                    best_one_sided_neighbors_sum = sum(best_one_sided, na.rm = T))
        # Next, calculate averages (that is, mean attacks/battledeaths per province).
        neighbors_sums_means[, 10:17] <- neighbors_sums_means[, 2:9] / length(temp)
        colnames(neighbors_sums_means)[10:17] <- c("attack_neighbors_mean","attack_state_based_neighbors_mean", 
                                                   "attack_non_state_neighbors_mean", "attack_one_sided_neighbors_mean",
                                                   "best_neighbors_mean",  "best_state_based_neighbors_mean", 
                                                   "best_non_state_neighbors_mean", "best_one_sided_neighbors_mean")
        # Add province name:
        neighbors_sums_means$GMI_ADMIN <- row.names(adjdf[i, ])
        
        # Store data:
        neighbor_list[[i]] <- neighbors_sums_means
      }
    }
  }
}

# Then, recombine:
neighbor_partial_panel <- rbindlist(neighbor_list)

# Next, add to dataset:
dataset <- join(dataset, neighbor_partial_panel)

# Assign 0s to NAs:
# Attack sums:
dataset$attack_neighbors_sum[is.na(dataset$attack_neighbors_sum)] <- 0
dataset$attack_state_based_neighbors_sum[is.na(dataset$attack_state_based_neighbors_sum)] <- 0
dataset$attack_non_state_neighbors_sum[is.na(dataset$attack_non_state_neighbors_sum)] <- 0
dataset$attack_one_sided_neighbors_sum[is.na(dataset$attack_one_sided_neighbors_sum)] <- 0
# Attack means:
dataset$attack_neighbors_mean[is.na(dataset$attack_neighbors_mean)] <- 0
dataset$attack_state_based_neighbors_mean[is.na(dataset$attack_state_based_neighbors_mean)] <- 0
dataset$attack_non_state_neighbors_mean[is.na(dataset$attack_non_state_neighbors_mean)] <- 0
dataset$attack_one_sided_neighbors_mean[is.na(dataset$attack_one_sided_neighbors_mean)] <- 0
# Battledeath sums:
dataset$best_neighbors_sum[is.na(dataset$best_neighbors_sum)] <- 0
dataset$best_state_based_neighbors_sum[is.na(dataset$best_state_based_neighbors_sum)] <- 0
dataset$best_non_state_neighbors_sum[is.na(dataset$best_non_state_neighbors_sum)] <- 0
dataset$best_one_sided_neighbors_sum[is.na(dataset$best_one_sided_neighbors_sum)] <- 0
# Battledeath means:
dataset$best_neighbors_mean[is.na(dataset$best_neighbors_mean)] <- 0
dataset$best_state_based_neighbors_mean[is.na(dataset$best_state_based_neighbors_mean)] <- 0
dataset$best_non_state_neighbors_mean[is.na(dataset$best_non_state_neighbors_mean)] <- 0
dataset$best_one_sided_neighbors_mean[is.na(dataset$best_one_sided_neighbors_mean)] <- 0

# Create a variable that indicates whether the sum of battledeaths in neighboring provinces
# is at least one:
dataset$best_neighbors_sum_binary <- ifelse(dataset$best_neighbors_sum > 0, 1, 0)

# Next, create a civil war border variable using conflict cirlces.

neighbor_list_circles <- list()
# For each province:
for(i in 1:length(adjdf)){
  # If that province is included in the set we study:
  if(row.names(adjdf)[i] %in% dataset$GMI_ADMIN){
    # Determine its contiguous neighbors:
    temp <- which(adjdf[i, ] == 1)
    # Does province have adjacencies? Only move forward with processing if so:
    if(length(temp) > 1){
      # Then, remove province itself:
      temp <- temp[!temp %in% i]
      # Then, use list of identified neighbors to subset to neighboring province-years 
      # experiencing civil war:
      civil_war_temp <- dataset[dataset$incidence %in% 1 & 
                                  dataset$GMI_ADMIN %in% row.names(adjdf[temp, ]), c("GMI_ADMIN", "Year", "incidence")]
      # Next, determine by year, whether each province is bordering at least one other province
      # experiencing civil war:
      if(nrow(civil_war_temp) > 0){
        civil_war_temp <- civil_war_temp %>%
          group_by(Year) %>%
          summarise(incidence_neighbor = max(incidence, na.rm = T))
      # Rename incidence in neighbor variable:
      colnames(civil_war_temp)[2] <- "incidence_neighbor"
      # Add province name:
      civil_war_temp$GMI_ADMIN <- row.names(adjdf[i, ])
      # Store data:
      neighbor_list_circles[[i]] <- civil_war_temp
      }
    }
  }
}

# Then, recombine:
neighbor_partial_panel_circles <- rbindlist(neighbor_list_circles)

# Next, add to dataset:
dataset <- join(dataset, neighbor_partial_panel_circles)

# Finally, assign 0s to NAs (between 1990 and 2008 only):
dataset$incidence_neighbor <- ifelse(dataset$Year %in% 
                                       seq(ymd("1990-01-01"), ymd("2008-01-01"), by = "years") &
                                       is.na(dataset$incidence_neighbor), 
                                     0, dataset$incidence_neighbor)

# Next, create a civil war border variable using wzone polygons:

neighbor_list_wzone <- list()
# For each province:
for(i in 1:length(adjdf)){
  # If that province is included in the set we study:
  if(row.names(adjdf)[i] %in% dataset$GMI_ADMIN){
    # Determine its contiguous neighbors:
    temp <- which(adjdf[i, ] == 1)
    # Does province have adjacencies? Only move forward with processing if so:
    if(length(temp) > 1){
      # Then, remove province itself:
      temp <- temp[!temp %in% i]
      # Then, use list of identified neighbors to subset to neighboring province-years 
      # experiencing civil war:
      civil_war_temp <- dataset[dataset$incidence_wzone %in% 1 & 
                                  dataset$GMI_ADMIN %in% row.names(adjdf[temp, ]), c("GMI_ADMIN", "Year", "incidence_wzone")]
      # Next, determine by year, whether each province is bordering at least one other province
      # experiencing civil war:
      if(nrow(civil_war_temp) > 0){
        civil_war_temp <- civil_war_temp %>%
          group_by(Year) %>%
          summarise(incidence_neighbor = max(incidence_wzone, na.rm = T))
        # Rename incidence in neighbor variable:
        colnames(civil_war_temp)[2] <- "incidence_wzone_neighbor"
        # Add province name:
        civil_war_temp$GMI_ADMIN <- row.names(adjdf[i, ])
        # Store data:
        neighbor_list_wzone[[i]] <- civil_war_temp
      }
    }
  }
}

# Then, recombine:
neighbor_partial_panel_wzone <- rbindlist(neighbor_list_wzone)

# Next, add to dataset:
dataset <- join(dataset, neighbor_partial_panel_wzone)

# Finally, assign 0s to NAs (between 1990 and 2008 only):
dataset$incidence_wzone_neighbor <- ifelse(dataset$Country %in% 
                                       "Syria" &
                                       is.na(dataset$incidence_wzone_neighbor), 
                                     0, dataset$incidence_wzone_neighbor)


# Create a war-border variable:
# NOTE: This variable determines whether a given province
# borders another province that is:
# a) experiencing a civil war; b) located in a foreign country.

adjdf_temp <- adjdf

# Complete column entries:
adjdf_temp$GMI_ADMIN <- names
adjdf_temp <- adjdf_temp[order(adjdf_temp$GMI_ADMIN), ]
adjdf_temp$GMI_ADMIN <- NULL
adjdf_temp <- t(adjdf_temp)
adjdf_temp <- as.data.frame(adjdf_temp)
adjdf_temp$GMI_ADMIN <- names
adjdf_temp <- adjdf_temp[order(adjdf_temp$GMI_ADMIN), ]
adjdf_temp$GMI_ADMIN <- NULL

# Reorder dataset:
dataset <- dataset[order(dataset$GMI_ADMIN), ]
row.names(dataset) <- 1:nrow(dataset)

dataset$warborder <- rep(NA)
data <- list()
for(h in 1:length(1990:2018)){
  data[[h]] <- dataset[dataset$Year == ymd(paste(h + 1989, "01", "01", sep = "-")), ]
  data[[h]]$GMI_ADMIN <- as.factor(data[[h]]$GMI_ADMIN)

  # Subset adjacency matrix to match only relevant provinces:
  relevant_province_numbers <- which(row.names(adjdf_temp) %in% dataset[dataset$Year %in% ymd(paste(h + 1989, "01", "01", sep = "-")), ]$GMI_ADMIN)
  temp_adjdf <- adjdf_temp[relevant_province_numbers, relevant_province_numbers]

  for(i in 1:length(row.names(temp_adjdf[which(row.names(temp_adjdf) %in%
                                          dataset[dataset$Year %in% ymd(paste(h + 1989, "01", "01", sep = "-")), ]$GMI_ADMIN)]))){
    a <- (data[[h]]$GMI_CNTRY[i]) == (data[[h]]$GMI_CNTRY)
    b <- (-1*a + 1)
    b <- as.data.frame(b)

    temp <- cbind(as.data.frame(data[[h]]$best), temp_adjdf[ , i], b)
    
    # as.data.frame(data[[h]]$best) gives each province's battle deaths.
    # temp_adjdf[ , i] gives vector of adjacencies for province i.
    # b indicates if each province is in foreign country (1 if foreign).
    
    temp$match <- temp[ , 1] * temp[ , 2] * temp[ , 3]
    # Thus, "match" returns vector of battle deaths for only those provinces which, relative to
    # province i, are contiguous and in a foreign country.
    
    temp$foreign_contiguous_match <- temp[ , 2] * temp[ , 3]
    
    # Now, create a variable for each province indicating whether or not it bordered a 
    # foreign province with at least one battledeath:

    DATA <- data[[h]]
    if(sum(temp$match, na.rm = T) >= 5){
      DATA$best_foreign_neighbors_sum_binary[i] <- 1
    } else {
      DATA$best_foreign_neighbors_sum_binary[i] <- 0
    }
    
    # We also want to know whether a given province is contiguous to a foreign country
    # generally:
    if(max(temp$foreign_contiguous_match, na.rm = T) %in% 1){
      DATA$border_province[i] <- 1
    } else {
      DATA$border_province[i] <- 0
    }
    data[[h]] <- DATA
  }
}

dataset <- as.data.frame(rbindlist(data))
rm(data)

# Add excluded groups measures:
# First, are there any province-years for which no data exists? If so, need to be careful in 
# distinguishing between 0s and NAs:
no_ethnicity_data <- prio_grid_temp %>%
  group_by(GMI_ADMIN, Year) %>%
  summarise(no_data = all(is.na(excluded)))
# There are many (11,603) cases where there is no data available:
# Thus, first subset those cases out of the data and first work with those cases where
# data (from at least one PRIO GRID) exists:
# First, generate list of affected province-years:
no_ethnicity_data$GMI_ADMIN_Year <- paste(no_ethnicity_data$GMI_ADMIN, no_ethnicity_data$Year, sep = "_")
no_ethnicity_data <- unique(no_ethnicity_data[no_ethnicity_data$no_data == T, ]$GMI_ADMIN_Year)
prio_grid_temp$GMI_ADMIN_Year <- paste(prio_grid_temp$GMI_ADMIN, prio_grid_temp$Year, sep = "_")
prio_grid_temp_ethicity_data <- prio_grid_temp[!(prio_grid_temp$GMI_ADMIN_Year %in% no_ethnicity_data), ]

# Then, calculate for each province-year mean and aggregate measures of the number of 
# excluded groups:
temp <- prio_grid_temp_ethicity_data  %>% 
  group_by(GMI_ADMIN, Year)  %>% 
  summarise(excluded_mean = mean(excluded, na.rm = T), excluded_sum = sum(excluded, na.rm = TRUE))
# Then, add to dataset:
dataset <- join(dataset, temp, by = c("GMI_ADMIN", "Year"))

# Finally, for those provinces for which ethnicity data exists, apply the average value to 
# the years following the last year for which data was available: 2014 through 2018:
# Calculate province averages:
temp <- dataset %>%
  group_by(GMI_ADMIN) %>%
  summarise(excluded_mean_province = mean(excluded_mean, na.rm = T), 
            excluded_sum_province = mean(excluded_sum, na.rm = T))
# Then, extend those values to years 2014 through 2018:
dataset <- join(dataset, temp)
dataset$excluded_mean <- ifelse(dataset$Year > c(ymd("2013-01-01")), dataset$excluded_mean_province, dataset$excluded_mean)

# Add nighttime lights:
# First, are there any province-years for which no data exists? If so, need to be careful in 
# distinguishing between 0s and NAs:
no_lights_data <- prio_grid_temp %>%
  group_by(GMI_ADMIN, Year) %>%
  summarise(no_data = all(is.na(nlights_calib_mean)))
# There are.
# Thus, first subset those cases out of the data and first work with those cases where
# data (from at least one PRIO GRID) exists:
# First, generate list of affected province-years:
no_lights_data$GMI_ADMIN_Year <- paste(no_lights_data$GMI_ADMIN, no_lights_data$Year, sep = "_")
no_lights_data <- unique(no_lights_data[no_lights_data$no_data == T, ]$GMI_ADMIN_Year)
prio_grid_temp_lights_data <- prio_grid_temp[!(prio_grid_temp$GMI_ADMIN_Year %in% no_lights_data), ]
# Calculate for each province-year means:
temp <- prio_grid_temp_lights_data  %>% 
  group_by(GMI_ADMIN, Year)  %>% 
  summarise(nlights_calib_mean = mean(nlights_calib_mean, na.rm = T), nlights_mean = mean(nlights_mean, na.rm = TRUE))
# Then, add to dataset:
dataset <- join(dataset, temp, by = c("GMI_ADMIN", "Year"))

# Now, calculate a measure of neighboring province nighttime lights:
nighttime_lights_yearly_averages <- temp
nighttime_lights_neighbor_list <- list()
# For each province:
for(i in 1:length(adjdf)){
  # If that province is included in the set we study:
  if(row.names(adjdf)[i] %in% dataset$GMI_ADMIN){
    # Determine its contiguous neighbors:
    temp <- which(adjdf[i, ] == 1)
    # Does province have adjacencies? Only move forward with processing if so:
    if(length(temp) > 1){
      # Then, remove province itself:
      temp <- temp[!temp %in% i]
      # Then, use list of identified neighbors to subset nighttime lights yearly averages dataset:
      nighttime_lights_temp <- nighttime_lights_yearly_averages[nighttime_lights_yearly_averages$GMI_ADMIN 
                                                                 %in% row.names(adjdf[temp, ]), ]
      
      # Next, calculate yearly neighbor averages:
      if(nrow(nighttime_lights_temp) > 0){
        nighttime_lights_neighbors_means <- nighttime_lights_temp %>%
          group_by(Year) %>%
          summarise(nlights_calib_mean_neighbor = mean(nlights_calib_mean, na.rm = T), nlights_mean_neighbor = mean(nlights_mean, na.rm = TRUE))
      
        # Add province name:
        nighttime_lights_neighbors_means$GMI_ADMIN <- row.names(adjdf[i, ])
        
        # Store data:
        nighttime_lights_neighbor_list[[i]] <- nighttime_lights_neighbors_means
      }
    }
  }
}

# Then, recombine:
nighttime_lights_neighbor_partial_panel <- rbindlist(nighttime_lights_neighbor_list)

# Next, add to dataset:
dataset <- join(dataset, nighttime_lights_neighbor_partial_panel)

# Finally, add a measure of urban development:
# For this, we use the "urban_ih" variable available through PRIO-GRID
# which "gives the percentage area of the cell covered by urban area, based
# on ISAM-HYDE landuse data."
# First, note that data for this variable is available only for 1990, 2000, and
# 2010; thus, first create subset for just those years:
urban <- prio_grid_temp[prio_grid_temp$Year %in% c(ymd("1990-01-01"),
                                                   ymd("2000-01-01"),
                                                   ymd("2010-01-01")), ]
# Note that there are some cases of NAs (although relatively very few):
nrow(urban[is.na(urban$urban_ih) ,])
# 1122
nrow(urban[is.na(urban$urban_ih) ,]) / nrow(urban)
# Roughly 0.6% of cases. 
# Next, calculate the average percentage across all cells associated with each 
# province:
temp <- urban  %>% 
  group_by(GMI_ADMIN, Year)  %>% 
  summarise(urban_ih = mean(urban_ih, na.rm = T))
# Then, add to dataset:
dataset <- join(dataset, temp, by = c("GMI_ADMIN", "Year"))

# Create lagged variables:
# Sort dataset:
dataset <- dataset[with(dataset, order(GMI_ADMIN, Year)), ]
# Renumber rows:
row.names(dataset) <- 1:nrow(dataset)
# Create vector of variables to lag:
vars_to_lag <- c("attack", "attack_state_based", "attack_non_state", "attack_one_sided",
                 "best", "best_state_based", "best_non_state", "best_one_sided",
                 "best_neighbors_sum", "best_neighbors_mean",
                 "best_state_based_neighbors_sum", "best_state_based_neighbors_mean",
                 "best_non_state_neighbors_sum", "best_non_state_neighbors_mean",
                 "best_one_sided_neighbors_sum", "best_one_sided_neighbors_mean",

                 "attack_neighbors_sum", "attack_neighbors_mean",
                 "attack_state_based_neighbors_sum", "attack_state_based_neighbors_mean",
                 "attack_non_state_neighbors_sum", "attack_non_state_neighbors_mean",
                 "attack_one_sided_neighbors_sum", "attack_one_sided_neighbors_mean",
                 
                 "incidence", "onset.n", "incidence_neighbor",
                 
                 "incidence_wzone", "onset.n_wzone", "incidence_wzone_neighbor",
                 
                 "rcb", "rsb", "idpb", "rtb", "rtb.other",
                 "gcp_ppp", "log_pop", "nlights_mean", "nlights_calib_mean",
                 "nlights_mean_neighbor", "nlights_calib_mean_neighbor", 
                 "urban_ih")

temp <- dataset
for(i in 1:length(vars_to_lag)){
  for(j in 1:5){
    temp <- slide(temp, Var = vars_to_lag[i], GroupVar = "GMI_ADMIN",
                  slideBy = -j)
    temp <- slide(temp, Var = vars_to_lag[i], GroupVar = "GMI_ADMIN",
                  slideBy = j)
  }  
}

names(temp) <- gsub(x = names(temp),
                    pattern = "-",
                    replacement = "_")

dataset <- temp

# Add provice size, continent and region variables to dataset:
GMI_ADMIN_SQKM <- shape@data[, c("GMI_ADMIN", "CONTINENT", "REGION")]
row.names(GMI_ADMIN_SQKM) <- 1:nrow(GMI_ADMIN_SQKM)
# Join with dataset:
dataset <- left_join(dataset, GMI_ADMIN_SQKM, by = c("GMI_ADMIN"))

# Finally, create a variable (using work done earlier in the code) that indicates for 
# each province-year whether 1) any new refugee site came into existence in that year; 
# 2) a new refugee settlement came into existence; or 3) a new refugee camp came into 
# existence. 
rc_rs_unique_years$new_site_rt_1 <- 1
rc_unique_years$new_site_rc_1 <- 1
rs_unique_years$new_site_rs_1 <- 1
colnames(rc_rs_unique_years)[2] <- colnames(rc_unique_years)[2] <- colnames(rs_unique_years)[2] <- "Year"

dataset <- plyr::join(dataset, rc_rs_unique_years)
dataset <- plyr::join(dataset, rc_unique_years)
dataset <- plyr::join(dataset, rs_unique_years)

dataset$new_site_rt_1[is.na(dataset$new_site_rt_1)] <- 0
dataset$new_site_rc_1[is.na(dataset$new_site_rc_1)] <- 0
dataset$new_site_rs_1[is.na(dataset$new_site_rs_1)] <- 0

# Then, construct a new set of variables that indicates instead whether a given province-year is 
# associated with new sites over either the current or past year:

# For each province, construct following-year indicators:
rc_rs_unique_years_2 <- rc_rs_unique_years
rc_unique_years_2 <- rc_unique_years
rs_unique_years_2 <- rs_unique_years

rc_rs_unique_years_2$Year <- rc_rs_unique_years_2$Year + lubridate::years(1)
rc_unique_years_2$Year <- rc_unique_years_2$Year + lubridate::years(1)
rs_unique_years_2$Year <- rs_unique_years_2$Year + lubridate::years(1)

colnames(rc_rs_unique_years_2)[3] <- "new_site_rt_2"
colnames(rc_unique_years_2)[3] <- "new_site_rc_2"
colnames(rs_unique_years_2)[3] <- "new_site_rs_2"

dataset <- plyr::join(dataset, rc_rs_unique_years_2)
dataset <- plyr::join(dataset, rc_unique_years_2)
dataset <- plyr::join(dataset, rs_unique_years_2)

dataset$new_site_rt_2[is.na(dataset$new_site_rt_2)] <- 0
dataset$new_site_rc_2[is.na(dataset$new_site_rc_2)] <- 0
dataset$new_site_rs_2[is.na(dataset$new_site_rs_2)] <- 0

dataset$new_site_rt_2 <- rowSums(dataset[, c("new_site_rt_1", "new_site_rt_2")])
dataset$new_site_rc_2 <- rowSums(dataset[, c("new_site_rc_1", "new_site_rc_2")])
dataset$new_site_rs_2 <- rowSums(dataset[, c("new_site_rs_1", "new_site_rs_2")])

dataset$new_site_rt_2 <- ifelse(dataset$new_site_rt_2 > 0, 1, 0)
dataset$new_site_rc_2 <- ifelse(dataset$new_site_rc_2 > 0, 1, 0)
dataset$new_site_rs_2 <- ifelse(dataset$new_site_rs_2 > 0, 1, 0)

summary(dataset$new_site_rt_1)
summary(dataset$new_site_rt_2)

# Then, create indicator variables for new-site refugee presence in another province of 
# a given country-year:
CntryYearNewSite <- dataset %>% 
  group_by(GMI_CNTRY, Year)  %>% 
  summarise(cntryyearrt_1 = sum(new_site_rt_1), 
            cntryyearrc_1 = sum(new_site_rc_1), 
            cntryyearrs_1 = sum(new_site_rs_1), 
            cntryyearrt_2 = sum(new_site_rt_2), 
            cntryyearrc_2 = sum(new_site_rc_2), 
            cntryyearrs_2 = sum(new_site_rs_2))
dataset <- join(dataset, CntryYearNewSite, by = c("GMI_CNTRY", "Year"))

# Then, create variable indicating whether another province-year unit hosted 
# refugees within the past one to two years regardless of whether the province-year 
# unit itself does as well:
dataset$new_site_rt_1.other <- ifelse(dataset$new_site_rt_1 == 0 & dataset$cntryyearrt_1 > 0, 1, 
                                      ifelse(dataset$new_site_rt_1 == 1 & dataset$cntryyearrt_1 > 1, 1, 0))
dataset$new_site_rc_1.other <- ifelse(dataset$new_site_rc_1 == 0 & dataset$cntryyearrc_1 > 0, 1, 
                                      ifelse(dataset$new_site_rc_1 == 1 & dataset$cntryyearrc_1 > 1, 1, 0))
dataset$new_site_rs_1.other <- ifelse(dataset$new_site_rs_1 == 0 & dataset$cntryyearrs_1 > 0, 1, 
                                      ifelse(dataset$new_site_rs_1 == 1 & dataset$cntryyearrs_1 > 1, 1, 0))

dataset$new_site_rt_2.other <- ifelse(dataset$new_site_rt_2 == 0 & dataset$cntryyearrt_2 > 0, 1, 
                                      ifelse(dataset$new_site_rt_2 == 1 & dataset$cntryyearrt_2 > 1, 1, 0))
dataset$new_site_rc_2.other <- ifelse(dataset$new_site_rc_2 == 0 & dataset$cntryyearrc_2 > 0, 1, 
                                      ifelse(dataset$new_site_rc_2 == 1 & dataset$cntryyearrc_2 > 1, 1, 0))
dataset$new_site_rs_2.other <- ifelse(dataset$new_site_rs_2 == 0 & dataset$cntryyearrs_2 > 0, 1, 
                                      ifelse(dataset$new_site_rs_2 == 1 & dataset$cntryyearrs_2 > 1, 1, 0))

# Next, create indicator variables that capture cases where camps/settlements existed but
# were not "new", as previously defined:
dataset$no_new_site_rt_1 <- ifelse(dataset$rtb %in% 1 & dataset$new_site_rt_1 %in% 0, 1, 0)
dataset$no_new_site_rc_1 <- ifelse(dataset$rcb %in% 1 & dataset$new_site_rc_1 %in% 0, 1, 0)
dataset$no_new_site_rs_1 <- ifelse(dataset$rsb %in% 1 & dataset$new_site_rs_1 %in% 0, 1, 0)

dataset$no_new_site_rt_2 <- ifelse(dataset$rtb %in% 1 & dataset$new_site_rt_2 %in% 0, 1, 0)
dataset$no_new_site_rc_2 <- ifelse(dataset$rcb %in% 1 & dataset$new_site_rc_2 %in% 0, 1, 0)
dataset$no_new_site_rs_2 <- ifelse(dataset$rsb %in% 1 & dataset$new_site_rs_2 %in% 0, 1, 0)

# Next, create near border variable:
dataset$nearborder <- case_when(dataset$bdist3 <= 100~1, 
                                dataset$bdist3 > 100~0,
                                TRUE~NA_real_)

# Next, create capital distance indicator:
median(dataset$capdist, na.rm = T)

dataset$nearcapital <- case_when(dataset$capdist <= median(dataset$capdist, na.rm = T)~1, 
                                dataset$capdist > median(dataset$capdist, na.rm = T)~0,
                                TRUE~NA_real_)

# Create a general year variable:
dataset$year <- lubridate::year(dataset$Year)

# Next, output dataset in which we only include countries that have had refugee sites:
# Identify countries that hosted refugees:
temp <- dataset %>%
  group_by(GMI_CNTRY) %>%
  summarise(rtb = max(rtb))
temp <- temp[temp$rtb== 1,]
temp <- temp[['GMI_CNTRY']]

dataset.subset <- dataset[dataset$GMI_CNTRY %in% temp, ]

# Write out both datasets:
write.table(dataset, file="DataOutput/panel.full_GED_2020.csv", sep=",", row.names=F)
write.table(dataset.subset, file="DataOutput/panel.subset_GED_2020.csv", sep=",", row.names=F)

# Next, create dynamic subsets:
# Identify countries that hosted refugees during specific periods of time:
dynamic_subsets <- list()

for(i in 1:length(1990:2018)){
  temp_dataset <- dataset[dataset$Year <= ymd(paste(1989 + i, "01", "01", sep = "-")), ]
  temp <- temp_dataset %>%
    group_by(GMI_CNTRY) %>%
    summarise(rtb = max(rtb))
  temp <- temp[temp$rtb== 1,]
  temp <- temp[['GMI_CNTRY']]
  
  dynamic_subsets[[i]] <- dataset[dataset$GMI_CNTRY %in% temp & dataset$Year == ymd(paste(1989 + i, "01", "01", sep = "-")), ]
}

dynamic_subset <- rbindlist(dynamic_subsets)

# Write out datasets:
# write.table(dynamic_subset, file="panel.dynamic.subset_GED_2020.csv", sep=",", row.names=F)
write.table(dynamic_subset, 
            file="DataOutput/panel.dynamic.subset_GED_2020.csv", sep=",", row.names=F)


# Next, create dataset for placebo work:
# Write a function that produces dataset for treatment of interest
# (e.g. refugee camp, settlement, etc.):
# For full placebo window:
placebo <- function(x){
  # First, create a dataset of controls:
  treated <- unique(dataset[dataset[[x]] == 1, ]$GMI_ADMIN)
  dataset.controls <- dataset[!(dataset$GMI_ADMIN %in% treated), ]
  # Next, create a dataset of only treated units:
  dataset.treated <- dataset[dataset$GMI_ADMIN %in% treated, ]
  # Next, drop all provinces from treated dataset with refugees in 1990
  # (the first year of analysis) because they cannot be used in a placebo analysis:
  exclusion_1990 <- unique(dataset.treated[dataset.treated[[x]] == 1 &
                                             dataset.treated$Year == ymd("1990-01-01"), ]$GMI_ADMIN)
  dataset.treated <- dataset.treated[!(dataset.treated$GMI_ADMIN %in% exclusion_1990), ]
  # Next, create individual year datasets that map conflict to refugee settlements to
  # all years in the dataset before they were established:
  list1 <- list()
  list2 <- list()
  list3 <- list()
  for(i in 1:length(1990:2017)){
    temp <- unique(unlist(list1))
    # Identify all treated provinces by first year of treatment:
    list1[[i]] <- unique(dataset.treated[dataset.treated[[x]] == 1 & dataset.treated$Year == (ymd("1990-01-01") + years(i)), ]$GMI_ADMIN)
    # Subset dataset to include only those provinces:
    list2[[i]] <- dataset.treated[dataset.treated$GMI_ADMIN %in% list1[[i]], ]
    # Assign values of 1 to all pre-refugee years:
    list2[[i]][[x]] <- 1
    # Drop any previously identified treated provinces:
    list2[[i]] <- list2[[i]][!(list2[[i]]$GMI_ADMIN %in% temp), ]
    # Store observations for only the pre-refugee period:
    list2[[i]] <- list2[[i]][list2[[i]]$Year <= (ymd("1989-01-01") + years(i)), ]
    # Also store observations for only the pre-refugee period, excluding the year before any refugee presence is officially documented:
    list3[[i]] <- list2[[i]][list2[[i]]$Year <= (ymd("1989-01-01") + years(i-1)), ]
  }
  # # Full placebo window:
  # # Combine these files:
  dataset.placebo <- as.data.frame(rbindlist(list2))
  dataset.placebo <- rbind(dataset.placebo, dataset.controls)
  return(dataset.placebo)
}
dataset.placebo.rcb <- placebo("rcb")
dataset.placebo.rsb <- placebo("rsb")
dataset.placebo.rtb <- placebo("rtb")
# Rename treatment variables for clarity:
names(dataset.placebo.rcb)[names(dataset.placebo.rcb) == "rcb"] <- "rcb.placebo"
names(dataset.placebo.rsb)[names(dataset.placebo.rsb) == "rsb"] <- "rsb.placebo"
names(dataset.placebo.rtb)[names(dataset.placebo.rtb) == "rtb"] <- "rtb.placebo"
# Create a placebo and other interaction term:
dataset.placebo.rsb$rsb.placebo_rsb.other <- dataset.placebo.rsb$rsb.placebo *
  dataset.placebo.rsb$rsb.other
dataset.placebo.rcb$rcb.placebo_rcb.other <- dataset.placebo.rcb$rcb.placebo *
  dataset.placebo.rcb$rcb.other
dataset.placebo.rtb$rtb.placebo_rtb.other <- dataset.placebo.rtb$rtb.placebo *
  dataset.placebo.rtb$rtb.other
# Write out datasets:
write.table(dataset.placebo.rcb, file="DataOutput/panel.placebo.rcb_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rsb, file="DataOutput/panel.placebo.rsb_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rtb, file="DataOutput/panel.placebo.rtb_GED_2020.csv", sep=",",row.names=F)
# Next, for year - 1 exclusion:
placebo <- function(x){
  # First, create a dataset of controls:
  treated <- unique(dataset[dataset[[x]] == 1, ]$GMI_ADMIN)
  dataset.controls <- dataset[!(dataset$GMI_ADMIN %in% treated), ]
  # Next, create a dataset of only treated units:
  dataset.treated <- dataset[dataset$GMI_ADMIN %in% treated, ]
  # Next, drop all provinces from treated dataset with refugees in 1990
  # (the first year of analysis) because they cannot be used in a placebo analysis:
  exclusion_1990 <- unique(dataset.treated[dataset.treated[[x]] == 1 &
                                             dataset.treated$Year == ymd("1990-01-01"), ]$GMI_ADMIN)
  dataset.treated <- dataset.treated[!(dataset.treated$GMI_ADMIN %in% exclusion_1990), ]
  # Next, create individual year datasets that map conflict to refugee settlements to
  # all years in the dataset before they were established:
  list1 <- list()
  list2 <- list()
  list3 <- list()
  for(i in 1:length(1990:2017)){
    temp <- unique(unlist(list1))
    # Identify all treated provinces by first year of treatment:
    list1[[i]] <- unique(dataset.treated[dataset.treated[[x]] == 1 & dataset.treated$Year == (ymd("1990-01-01") + years(i)), ]$GMI_ADMIN)
    # Subset dataset to include only those provinces:
    list2[[i]] <- dataset.treated[dataset.treated$GMI_ADMIN %in% list1[[i]], ]
    # Assign values of 1 to all pre-refugee years:
    list2[[i]][[x]] <- 1
    # Drop any previously identified treated provinces:
    list2[[i]] <- list2[[i]][!(list2[[i]]$GMI_ADMIN %in% temp), ]
    # Store observations for only the pre-refugee period:
    list2[[i]] <- list2[[i]][list2[[i]]$Year <= (ymd("1989-01-01") + years(i)), ]
    # Also store observations for only the pre-refugee period, excluding the year before any refugee presence is officially documented:
    list3[[i]] <- list2[[i]][list2[[i]]$Year <= (ymd("1989-01-01") + years(i-1)), ]
  }
  # Placebo with year - 1 exclusion:
  # Combine these files:
  dataset.placebo.t1 <- as.data.frame(rbindlist(list3))
  dataset.placebo.t1 <- rbind(dataset.placebo.t1, dataset.controls)
  return(dataset.placebo.t1)
}
dataset.placebo.rcb.t1 <- placebo("rcb")
dataset.placebo.rsb.t1 <- placebo("rsb")
dataset.placebo.rtb.t1 <- placebo("rtb")
# Rename treatment variables for clarity:
names(dataset.placebo.rcb.t1)[names(dataset.placebo.rcb.t1) == "rcb"] <- "rcb.placebo"
names(dataset.placebo.rsb.t1)[names(dataset.placebo.rsb.t1) == "rsb"] <- "rsb.placebo"
names(dataset.placebo.rtb.t1)[names(dataset.placebo.rtb.t1) == "rtb"] <- "rtb.placebo"
# Create a placebo and other interaction term:
dataset.placebo.rsb.t1$rsb.placebo_rsb.other <- dataset.placebo.rsb.t1$rsb.placebo *
  dataset.placebo.rsb.t1$rsb.other
dataset.placebo.rcb.t1$rcb.placebo_rcb.other <- dataset.placebo.rcb.t1$rcb.placebo *
  dataset.placebo.rcb.t1$rcb.other
dataset.placebo.rtb.t1$rtb.placebo_rtb.other <- dataset.placebo.rtb.t1$rtb.placebo *
  dataset.placebo.rtb.t1$rtb.other
# Write out datasets:
write.table(dataset.placebo.rcb.t1, file="DataOutput/panel.placebo.rcb.t1_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rsb.t1, file="DataOutput/panel.placebo.rsb.t1_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rtb.t1, file="DataOutput/panel.placebo.rtb.t1_GED_2020.csv", sep=",",row.names=F)

# Now, return datasets of countries that includes only those that received refugees:
subset <- function(x, y){
  temp <- dataset %>%
    group_by(GMI_CNTRY) %>%
    summarise(rcb = max(rcb), rsb = max(rsb), rtb = max(rtb))
  temp <- temp[temp[[x]] == 1,]
  temp <- temp[['GMI_CNTRY']]
  dataset.placebo.subset <- y[y$GMI_CNTRY %in% temp, ]
  return(dataset.placebo.subset)
}

dataset.placebo.rcb.subset <- subset("rcb", dataset.placebo.rcb)
dataset.placebo.rsb.subset <- subset("rsb", dataset.placebo.rsb)
dataset.placebo.rtb.subset <- subset("rtb", dataset.placebo.rtb)

dataset.placebo.rcb.t1.subset <- subset("rcb", dataset.placebo.rcb.t1)
dataset.placebo.rsb.t1.subset <- subset("rsb", dataset.placebo.rsb.t1)
dataset.placebo.rtb.t1.subset <- subset("rtb", dataset.placebo.rtb.t1)

# Rename treatment variables for clarity:
names(dataset.placebo.rcb.subset)[names(dataset.placebo.rcb.subset) == "rcb"] <- "rcb.placebo"
names(dataset.placebo.rsb.subset)[names(dataset.placebo.rsb.subset) == "rsb"] <- "rsb.placebo"
names(dataset.placebo.rtb.subset)[names(dataset.placebo.rtb.subset) == "rtb"] <- "rtb.placebo"

names(dataset.placebo.rcb.t1.subset)[names(dataset.placebo.rcb.t1.subset) == "rcb"] <- "rcb.placebo"
names(dataset.placebo.rsb.t1.subset)[names(dataset.placebo.rsb.t1.subset) == "rsb"] <- "rsb.placebo"
names(dataset.placebo.rtb.t1.subset)[names(dataset.placebo.rtb.t1.subset) == "rtb"] <- "rtb.placebo"

# Write out datasets:
write.table(dataset.placebo.rcb.subset, file="DataOutput/panel.placebo.rcb.subset_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rsb.subset, file="DataOutput/panel.placebo.rsb.subset_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rtb.subset, file="DataOutput/panel.placebo.rtb.subset_GED_2020.csv", sep=",",row.names=F)

write.table(dataset.placebo.rcb.t1.subset, file="DataOutput/panel.placebo.rcb.t1.subset_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rsb.t1.subset, file="DataOutput/panel.placebo.rsb.t1.subset_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.rtb.t1.subset, file="DataOutput/panel.placebo.rtb.t1.subset_GED_2020.csv", sep=",",row.names=F)

# Next, create dynamic subsets for placebo datasets:
# Identify countries that hosted refugees during specific periods of time:
placebo_dynamic_subset_function <- function(x){
  placebo_dynamic_subsets <- list()
  for(i in 1:length(1990:2018)){
    temp_dataset <- dataset[dataset$Year <= ymd(paste(1989 + i, "01", "01", sep = "-")), ]
    temp <- temp_dataset %>%
      group_by(GMI_CNTRY) %>%
      summarise(rtb = max(rtb))
    temp <- temp[temp$rtb== 1,]
    temp <- temp[['GMI_CNTRY']]
    placebo_dynamic_subsets[[i]] <- x[x$GMI_CNTRY %in% temp & x$Year == ymd(paste(1989 + i, "01", "01", sep = "-")), ]
  }
  placebo_dynamic_subset <- rbindlist(placebo_dynamic_subsets)
  return(placebo_dynamic_subset)
}

dataset.placebo.dynamic.subset.rcb <- placebo_dynamic_subset_function(dataset.placebo.rcb)
dataset.placebo.dynamic.subset.rsb <- placebo_dynamic_subset_function(dataset.placebo.rsb)
dataset.placebo.dynamic.subset.rtb <- placebo_dynamic_subset_function(dataset.placebo.rtb)

# Write out datasets:
write.table(dataset.placebo.dynamic.subset.rcb, file="DataOutput/panel.placebo.dynamic.subset.rcb_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.dynamic.subset.rsb, file="DataOutput/panel.placebo.dynamic.subset.rsb_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.dynamic.subset.rtb, file="DataOutput/panel.placebo.dynamic.subset.rtb_GED_2020.csv", sep=",",row.names=F)

dataset.placebo.dynamic.subset.rcb.t1 <- placebo_dynamic_subset_function(dataset.placebo.rcb.t1)
dataset.placebo.dynamic.subset.rsb.t1 <- placebo_dynamic_subset_function(dataset.placebo.rsb.t1)
dataset.placebo.dynamic.subset.rtb.t1 <- placebo_dynamic_subset_function(dataset.placebo.rtb.t1)

# Write out datasets:
write.table(dataset.placebo.dynamic.subset.rcb.t1, file="DataOutput/panel.placebo.dynamic.subset.rcb.t1_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.dynamic.subset.rsb.t1, file="DataOutput/panel.placebo.dynamic.subset.rsb.t1_GED_2020.csv", sep=",",row.names=F)
write.table(dataset.placebo.dynamic.subset.rtb.t1, file="DataOutput/panel.placebo.dynamic.subset.rtb.t1_GED_2020.csv", sep=",",row.names=F)

# Write out camps and settlements dataset:
write.table(camps_settlements, file="DataOutput/camps_settlements_processed.csv", sep=",",row.names=F)

###
### END.
###
